public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, PR43513] Replace vla with array.
@ 2011-07-27 10:58 Tom de Vries
  2011-07-27 10:59 ` [PATCH PR43513, 1/3] Replace vla with array - Implementation Tom de Vries
                   ` (3 more replies)
  0 siblings, 4 replies; 37+ messages in thread
From: Tom de Vries @ 2011-07-27 10:58 UTC (permalink / raw)
  To: rguenther; +Cc: gcc-patches

Hi Richard,

I have a patch set for bug 43513 - The stack pointer is adjusted twice.

01_pr43513.3.patch
02_pr43513.3.test.patch
03_pr43513.3.mudflap.patch

The patch set has been bootstrapped and reg-tested on x86_64.

I will sent out the patches individually.

The patch replaces a vla __builtin_alloca that has a constant argument with an
array declaration.

The effect of the patch on the example from the bug report for
-march=armv7-a -mthumb -Os -fpic is a text size reduction from 72 to 64 bytes.
This is demonstrated in the following diff, where left is without and right is
with patch.

push	{r4, r5, r6, r7, r8, lr}      |	push	{r4, r5, r6, lr}
add	r7, sp, #0		      |	sub	sp, sp, #40
sub	sp, sp, #48		      <
ldr	r4, .L4				ldr	r4, .L4
mov	r0, sp				mov	r0, sp
ldr	r6, .L4+4		      <
bl	bar(PLT)			bl	bar(PLT)
ldr	r3, .L4+8		      |	ldr	r3, .L4+4
.LPIC8:					.LPIC8:
add	r4, pc				add	r4, pc
mov	r8, sp			      |	ldr	r6, .L4+8
movs	r5, #0				movs	r5, #0
				      >	ldr	r4, [r4, r3]
.LPIC9:					.LPIC9:
add	r6, pc				add	r6, pc
ldr	r4, [r4, r3]		      <
.L2:					.L2:
ldr	r2, [r8, r5]		      |	ldr	r2, [sp, r5]
mov	r1, r6				mov	r1, r6
ldr	r0, [r4, #0]			ldr	r0, [r4, #0]
adds	r5, r5, #4			adds	r5, r5, #4
bl	fprintf(PLT)			bl	fprintf(PLT)
cmp	r5, #40				cmp	r5, #40
bne	.L2				bne	.L2
mov	sp, r7			      |	add	sp, sp, #40
pop	{r4, r5, r6, r7, r8, pc}      |	pop	{r4, r5, r6, pc}
.L5:					.L5:
.align	2				.align	2
.L4:					.L4:
.word	_GLOBAL_OFFSET_TABLE_-(.LPIC8	.word	_GLOBAL_OFFSET_TABLE_-(.LPIC8
.word	.LC0-(.LPIC9+4)		      <
.word	stdout(GOT)			.word	stdout(GOT)
				      >	.word	.LC0-(.LPIC9+4)
.size	foo3, .-foo3			.size	foo3, .-foo3
.section	.rodata.str1.1,"aMS",	.section	.rodata.str1.1,"aMS",
.LC0:					.LC0:
.ascii	"%d \000"			.ascii	"%d \000"

Thanks,
- Tom

^ permalink raw reply	[flat|nested] 37+ messages in thread

* [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-27 10:58 [PATCH, PR43513] Replace vla with array Tom de Vries
@ 2011-07-27 10:59 ` Tom de Vries
  2011-07-27 11:40   ` Richard Guenther
  2011-07-27 11:00 ` [PATCH PR43513, 2/3] Replace vla with array - Test case Tom de Vries
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 37+ messages in thread
From: Tom de Vries @ 2011-07-27 10:59 UTC (permalink / raw)
  To: rguenther; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 690 bytes --]

On 07/27/2011 01:50 PM, Tom de Vries wrote:
> Hi Richard,
> 
> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
> 
> 01_pr43513.3.patch
> 02_pr43513.3.test.patch
> 03_pr43513.3.mudflap.patch
> 
> The patch set has been bootstrapped and reg-tested on x86_64.
> 
> I will sent out the patches individually.
> 

The patch replaces a vla __builtin_alloca that has a constant argument with an
array declaration.

OK for trunk?

Thanks,
- Tom

2011-07-27  Tom de Vries  <tom@codesourcery.com>

	PR middle-end/43513
	* builtins.c (fold_builtin_alloca): New function.
	* tree.h (fold_builtin_alloca): Declare.
	* gimple-fold.c (gimple_fold_builtin): Use fold_builtin_alloca.

[-- Attachment #2: 01_pr43513.3.patch --]
[-- Type: text/x-patch, Size: 2654 bytes --]

Index: gcc/builtins.c
===================================================================
--- gcc/builtins.c	(revision 175801)
+++ gcc/builtins.c	(working copy)
@@ -6193,6 +6193,43 @@ builtin_mathfn_code (const_tree t)
   return DECL_FUNCTION_CODE (fndecl);
 }
 
+/* Detects a vla-related alloca with a constant argument.  Declares fixed-size
+   array and return the address, if found, otherwise returns NULL_TREE.  */
+
+tree
+fold_builtin_alloca (tree lhs, tree arg)
+{
+  unsigned HOST_WIDE_INT size, n_elem, elem_size;
+  tree var_type, vla_type, elem_type, array_type;
+
+  if (lhs == NULL_TREE)
+    return NULL_TREE;
+
+  /* Detect constant argument.  */
+  if (TREE_CODE (arg) != INTEGER_CST || !host_integerp (arg, 1))
+    return NULL_TREE;
+  size = TREE_INT_CST_LOW (arg);
+
+  /* Detect a vla.  */
+  var_type = TREE_TYPE (SSA_NAME_VAR (lhs));
+  if (TREE_CODE (var_type) != POINTER_TYPE)
+    return NULL_TREE;
+  vla_type = TREE_TYPE (var_type);
+  if (TREE_CODE (vla_type) != ARRAY_TYPE)
+    return NULL_TREE;
+  if (TREE_CODE (TYPE_MAXVAL (TYPE_DOMAIN (vla_type))) == INTEGER_CST)
+    return NULL_TREE;
+  elem_type = TREE_TYPE (vla_type);
+  if (TREE_CODE (TYPE_SIZE_UNIT (elem_type)) != INTEGER_CST)
+    return NULL_TREE;
+
+  /* Declare a fixed-size array and return the address instead.  */
+  elem_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (elem_type));
+  n_elem = size / elem_size;
+  array_type = build_array_type_nelts (elem_type, n_elem);
+  return build_fold_addr_expr (create_tmp_var (array_type, "vla_cst"));
+}
+
 /* Fold a call to __builtin_constant_p, if we know its argument ARG will
    evaluate to a constant.  */
 
Index: gcc/tree.h
===================================================================
--- gcc/tree.h	(revision 175801)
+++ gcc/tree.h	(working copy)
@@ -5321,6 +5321,7 @@ truth_value_p (enum tree_code code)
 
 /* In builtins.c */
 extern tree fold_call_expr (location_t, tree, bool);
+extern tree fold_builtin_alloca (tree, tree);
 extern tree fold_builtin_fputs (location_t, tree, tree, bool, bool, tree);
 extern tree fold_builtin_strcpy (location_t, tree, tree, tree, tree);
 extern tree fold_builtin_strncpy (location_t, tree, tree, tree, tree, tree);
Index: gcc/gimple-fold.c
===================================================================
--- gcc/gimple-fold.c	(revision 175801)
+++ gcc/gimple-fold.c	(working copy)
@@ -1246,6 +1246,9 @@ gimple_fold_builtin (gimple stmt)
       arg_idx = 1;
       type = 2;
       break;
+    case BUILT_IN_ALLOCA:
+      return fold_builtin_alloca (gimple_call_lhs (stmt),
+				  gimple_call_arg (stmt, 0));
     default:
       return NULL_TREE;
     }

^ permalink raw reply	[flat|nested] 37+ messages in thread

* [PATCH PR43513,  2/3] Replace vla with array - Test case.
  2011-07-27 10:58 [PATCH, PR43513] Replace vla with array Tom de Vries
  2011-07-27 10:59 ` [PATCH PR43513, 1/3] Replace vla with array - Implementation Tom de Vries
@ 2011-07-27 11:00 ` Tom de Vries
  2011-07-27 11:12   ` Tom de Vries
  2011-07-27 11:21 ` [PATCH PR43513, 3/3] Replace vla with array - Adapt mudflap testcase Tom de Vries
  2011-07-27 11:51 ` [PATCH, PR43513] Replace vla with array Jakub Jelinek
  3 siblings, 1 reply; 37+ messages in thread
From: Tom de Vries @ 2011-07-27 11:00 UTC (permalink / raw)
  To: rguenther; +Cc: gcc-patches

On 07/27/2011 01:50 PM, Tom de Vries wrote:
> Hi Richard,
> 
> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
> 
> 01_pr43513.3.patch
> 02_pr43513.3.test.patch
> 03_pr43513.3.mudflap.patch
> 
> The patch set has been bootstrapped and reg-tested on x86_64.
> 
> I will sent out the patches individually.
> 

This patch adds the testcase from the bug report, modified not to
need includes.

OK for trunk?

Thanks,
- Tom

2011-07-27  Tom de Vries  <tom@codesourcery.com>

	PR middle-end/43513
	* gcc.dg/pr43513.c: New test.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  2/3] Replace vla with array - Test case.
  2011-07-27 11:00 ` [PATCH PR43513, 2/3] Replace vla with array - Test case Tom de Vries
@ 2011-07-27 11:12   ` Tom de Vries
  2011-07-30 10:39     ` Tom de Vries
  0 siblings, 1 reply; 37+ messages in thread
From: Tom de Vries @ 2011-07-27 11:12 UTC (permalink / raw)
  To: rguenther; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 657 bytes --]

On 07/27/2011 01:54 PM, Tom de Vries wrote:
> On 07/27/2011 01:50 PM, Tom de Vries wrote:
>> Hi Richard,
>>
>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
>>
>> 01_pr43513.3.patch
>> 02_pr43513.3.test.patch
>> 03_pr43513.3.mudflap.patch
>>
>> The patch set has been bootstrapped and reg-tested on x86_64.
>>
>> I will sent out the patches individually.
>>

Sorry, with patch this time.

> 
> This patch adds the testcase from the bug report, modified not to
> need includes.
> 
> OK for trunk?
> 
> Thanks,
> - Tom
> 
> 2011-07-27  Tom de Vries  <tom@codesourcery.com>
> 
> 	PR middle-end/43513
> 	* gcc.dg/pr43513.c: New test.


[-- Attachment #2: 02_pr43513.3.test.patch --]
[-- Type: text/x-patch, Size: 626 bytes --]

Index: gcc/testsuite/gcc.dg/pr43513.c
===================================================================
--- gcc/testsuite/gcc.dg/pr43513.c	(revision 0)
+++ gcc/testsuite/gcc.dg/pr43513.c	(revision 0)
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-ccp1" } */
+
+void bar (int *);
+void foo (char *, int);
+
+void
+foo3 ()
+{
+  const int kIterations = 10;
+  int results[kIterations];
+  int i;
+  bar (results);
+  for (i = 0; i < kIterations; i++)
+    foo ("%d ", results[i]);
+}
+
+/* { dg-final { scan-tree-dump-times "alloca" 0 "ccp1"} } */
+/* { dg-final { cleanup-tree-dump "ccp1" } } */

^ permalink raw reply	[flat|nested] 37+ messages in thread

* [PATCH PR43513,  3/3] Replace vla with array - Adapt mudflap testcase.
  2011-07-27 10:58 [PATCH, PR43513] Replace vla with array Tom de Vries
  2011-07-27 10:59 ` [PATCH PR43513, 1/3] Replace vla with array - Implementation Tom de Vries
  2011-07-27 11:00 ` [PATCH PR43513, 2/3] Replace vla with array - Test case Tom de Vries
@ 2011-07-27 11:21 ` Tom de Vries
  2011-07-27 11:51 ` [PATCH, PR43513] Replace vla with array Jakub Jelinek
  3 siblings, 0 replies; 37+ messages in thread
From: Tom de Vries @ 2011-07-27 11:21 UTC (permalink / raw)
  To: rguenther; +Cc: gcc-patches, fche

[-- Attachment #1: Type: text/plain, Size: 1068 bytes --]

On 07/27/2011 01:50 PM, Tom de Vries wrote:
> Hi Richard,
> 
> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
> 
> 01_pr43513.3.patch
> 02_pr43513.3.test.patch
> 03_pr43513.3.mudflap.patch
> 
> The patch set has been bootstrapped and reg-tested on x86_64.
> 
> I will sent out the patches individually.
> 

The testcase
http://gcc.gnu.org/svn/gcc/trunk/libmudflap/testsuite/libmudflap.c/fail31-frag.c
contains a vla and a dead and illegal store to that vla.
Dead because the contents of the array is not read during it's lifetime, illegal
because it writes past the array end.

Normally, the store is not removed by pass_cd_dce because
is_hidden_global_store.  With the alloca folded into an array declaration, this
is no longer the case, the store is removed and mudflap no longer complains.

Adding the attribute ensures that the alloca is not folded.

OK for trunk?

Thanks,
- Tom

2011-07-27  Tom de Vries  <tom@codesourcery.com>

	PR middle-end/43513
	* testsuite/libmudflap.c/fail31-frag.c: Adapt testcase to prevent
	folding of alloca.

[-- Attachment #2: 03_pr43513.3.mudflap.patch --]
[-- Type: text/x-patch, Size: 378 bytes --]

Index: libmudflap/testsuite/libmudflap.c/fail31-frag.c
===================================================================
--- libmudflap/testsuite/libmudflap.c/fail31-frag.c	(revision 176554)
+++ libmudflap/testsuite/libmudflap.c/fail31-frag.c	(working copy)
@@ -9,6 +9,7 @@ int main ()
   return 0;
 }
 int *p;
+__attribute__((noinline))
 int h (int i, int j)
 {
   int k[i];

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-27 10:59 ` [PATCH PR43513, 1/3] Replace vla with array - Implementation Tom de Vries
@ 2011-07-27 11:40   ` Richard Guenther
  2011-07-27 14:37     ` Tom de Vries
  0 siblings, 1 reply; 37+ messages in thread
From: Richard Guenther @ 2011-07-27 11:40 UTC (permalink / raw)
  To: Tom de Vries; +Cc: gcc-patches

On Wed, 27 Jul 2011, Tom de Vries wrote:

> On 07/27/2011 01:50 PM, Tom de Vries wrote:
> > Hi Richard,
> > 
> > I have a patch set for bug 43513 - The stack pointer is adjusted twice.
> > 
> > 01_pr43513.3.patch
> > 02_pr43513.3.test.patch
> > 03_pr43513.3.mudflap.patch
> > 
> > The patch set has been bootstrapped and reg-tested on x86_64.
> > 
> > I will sent out the patches individually.
> > 
> 
> The patch replaces a vla __builtin_alloca that has a constant argument with an
> array declaration.
> 
> OK for trunk?

I don't think it is safe to try to get at the VLA type the way you do.
In fact I would simply do sth like

  elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
  n_elem = size * 8 / BITS_PER_UNIT;
  array_type = build_array_type_nelts (elem_type, n_elem);
  var = create_tmp_var (array_type, NULL);
  return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));

And obviously you lose the optimization we arrange with inserting
__builtin_stack_save/restore pairs that way - stack space will no
longer be shared for subsequent VLAs.  Which means that you'd
better limit the size you allow this promotion.

Alternatively this promotion could happen alongsize 
optimize_stack_restore using more global knowledge of the effects
on the maximum stack size this folding produces.

Thanks,
Richard.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH, PR43513] Replace vla with array.
  2011-07-27 10:58 [PATCH, PR43513] Replace vla with array Tom de Vries
                   ` (2 preceding siblings ...)
  2011-07-27 11:21 ` [PATCH PR43513, 3/3] Replace vla with array - Adapt mudflap testcase Tom de Vries
@ 2011-07-27 11:51 ` Jakub Jelinek
  3 siblings, 0 replies; 37+ messages in thread
From: Jakub Jelinek @ 2011-07-27 11:51 UTC (permalink / raw)
  To: Tom de Vries; +Cc: rguenther, gcc-patches

On Wed, Jul 27, 2011 at 01:50:47PM +0300, Tom de Vries wrote:
> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
> 
> 01_pr43513.3.patch
> 02_pr43513.3.test.patch
> 03_pr43513.3.mudflap.patch
> 
> The patch set has been bootstrapped and reg-tested on x86_64.
> 
> I will sent out the patches individually.
> 
> The patch replaces a vla __builtin_alloca that has a constant argument with an
> array declaration.

IMHO you don't want to do this unconditionally, for small sizes it might be
ok, but for larger sizes e.g. some unlikely code path might use a VLA for
very large allocation to keep the default stack usage in reasonable limits,
or to be able to reuse that stack for something else, etc.

	Jakub

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-27 11:40   ` Richard Guenther
@ 2011-07-27 14:37     ` Tom de Vries
  2011-07-27 14:52       ` Richard Guenther
  0 siblings, 1 reply; 37+ messages in thread
From: Tom de Vries @ 2011-07-27 14:37 UTC (permalink / raw)
  To: Richard Guenther; +Cc: gcc-patches

On 07/27/2011 02:12 PM, Richard Guenther wrote:
> On Wed, 27 Jul 2011, Tom de Vries wrote:
> 
>> On 07/27/2011 01:50 PM, Tom de Vries wrote:
>>> Hi Richard,
>>>
>>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
>>>
>>> 01_pr43513.3.patch
>>> 02_pr43513.3.test.patch
>>> 03_pr43513.3.mudflap.patch
>>>
>>> The patch set has been bootstrapped and reg-tested on x86_64.
>>>
>>> I will sent out the patches individually.
>>>
>>
>> The patch replaces a vla __builtin_alloca that has a constant argument with an
>> array declaration.
>>
>> OK for trunk?
> 
> I don't think it is safe to try to get at the VLA type the way you do.

I don't understand in what way it's not safe. Do you mean I don't manage to find
the type always, or that I find the wrong type, or something else?

> In fact I would simply do sth like
> 
>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>   n_elem = size * 8 / BITS_PER_UNIT;
>   array_type = build_array_type_nelts (elem_type, n_elem);
>   var = create_tmp_var (array_type, NULL);
>   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
> 

I tried this code on the example, and it works, but the newly declared type has
an 8-bit alignment, while the vla base type has a 32 bit alignment.  This make
the memory access in the example potentially unaligned, which prohibits an
ivopts optimization, so the resulting text size is 68 instead of the 64 achieved
with my current patch.

> And obviously you lose the optimization we arrange with inserting
> __builtin_stack_save/restore pairs that way - stack space will no
> longer be shared for subsequent VLAs.  Which means that you'd
> better limit the size you allow this promotion.
> 

Right, I could introduce a parameter for this.

> Alternatively this promotion could happen alongsize 
> optimize_stack_restore using more global knowledge of the effects
> on the maximum stack size this folding produces.
> 

OK, I'll look into this.

Thanks,
- Tom

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-27 14:37     ` Tom de Vries
@ 2011-07-27 14:52       ` Richard Guenther
  2011-07-27 16:12         ` Michael Matz
  2011-07-27 17:46         ` Tom de Vries
  0 siblings, 2 replies; 37+ messages in thread
From: Richard Guenther @ 2011-07-27 14:52 UTC (permalink / raw)
  To: Tom de Vries; +Cc: gcc-patches

On Wed, 27 Jul 2011, Tom de Vries wrote:

> On 07/27/2011 02:12 PM, Richard Guenther wrote:
> > On Wed, 27 Jul 2011, Tom de Vries wrote:
> > 
> >> On 07/27/2011 01:50 PM, Tom de Vries wrote:
> >>> Hi Richard,
> >>>
> >>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
> >>>
> >>> 01_pr43513.3.patch
> >>> 02_pr43513.3.test.patch
> >>> 03_pr43513.3.mudflap.patch
> >>>
> >>> The patch set has been bootstrapped and reg-tested on x86_64.
> >>>
> >>> I will sent out the patches individually.
> >>>
> >>
> >> The patch replaces a vla __builtin_alloca that has a constant argument with an
> >> array declaration.
> >>
> >> OK for trunk?
> > 
> > I don't think it is safe to try to get at the VLA type the way you do.
> 
> I don't understand in what way it's not safe. Do you mean I don't manage to find
> the type always, or that I find the wrong type, or something else?

I think you might get the wrong type, you also do not transform code
like

  int *p = alloca(4);
  *p = 3;

as there is no array type involved here.

> > In fact I would simply do sth like
> > 
> >   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
> >   n_elem = size * 8 / BITS_PER_UNIT;
> >   array_type = build_array_type_nelts (elem_type, n_elem);
> >   var = create_tmp_var (array_type, NULL);
> >   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
> > 
> 
> I tried this code on the example, and it works, but the newly declared type has
> an 8-bit alignment, while the vla base type has a 32 bit alignment.  This make
> the memory access in the example potentially unaligned, which prohibits an
> ivopts optimization, so the resulting text size is 68 instead of the 64 achieved
> with my current patch.

Ok, so then set DECL_ALIGN of the variable to something reasonable
like MIN (size * 8, GET_MODE_PRECISION (word_mode)).  Basically the
alignment that the targets alloca function would guarantee.

> > And obviously you lose the optimization we arrange with inserting
> > __builtin_stack_save/restore pairs that way - stack space will no
> > longer be shared for subsequent VLAs.  Which means that you'd
> > better limit the size you allow this promotion.
> > 
> 
> Right, I could introduce a parameter for this.

I would think you could use PARAM_LARGE_STACK_FRAME for now and say,
allow a size of PARAM_LARGE_STACK_FRAME / 10?

> > Alternatively this promotion could happen alongsize 
> > optimize_stack_restore using more global knowledge of the effects
> > on the maximum stack size this folding produces.
> > 
> 
> OK, I'll look into this.

Thanks,
Richard.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-27 14:52       ` Richard Guenther
@ 2011-07-27 16:12         ` Michael Matz
  2011-07-28  9:22           ` Richard Guenther
  2011-07-27 17:46         ` Tom de Vries
  1 sibling, 1 reply; 37+ messages in thread
From: Michael Matz @ 2011-07-27 16:12 UTC (permalink / raw)
  To: Richard Guenther; +Cc: Tom de Vries, gcc-patches

Hi,

On Wed, 27 Jul 2011, Richard Guenther wrote:

> > > I don't think it is safe to try to get at the VLA type the way you do.
> > 
> > I don't understand in what way it's not safe. Do you mean I don't manage to find
> > the type always, or that I find the wrong type, or something else?
> 
> I think you might get the wrong type, you also do not transform code
> like
> 
>   int *p = alloca(4);
>   *p = 3;
> 
> as there is no array type involved here.

That's good, because you _can't_ transform that code into an array decl.  
See:

   for (int i = 0; i < 100; i++)
     p[i] = alloca(4);
   assert (p[0] != p[1]);

vs.
   char vla_cst[4];
   for (int i = 0; i < 100; i++)
     p[i] = &vla_cst;
   assert (p[0] != p[1]);

Tom: you can reliably detect if an alloca call is for a VLA by checking 
CALL_ALLOCA_FOR_VAR_P (on a tree call expression, but only if it's a 
builtin call) or gimple_call_alloca_for_var_p (on a gimple call stmt).


Ciao,
Michael.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-27 14:52       ` Richard Guenther
  2011-07-27 16:12         ` Michael Matz
@ 2011-07-27 17:46         ` Tom de Vries
  2011-07-28  9:44           ` Richard Guenther
  1 sibling, 1 reply; 37+ messages in thread
From: Tom de Vries @ 2011-07-27 17:46 UTC (permalink / raw)
  To: Richard Guenther; +Cc: gcc-patches

On 07/27/2011 05:27 PM, Richard Guenther wrote:
> On Wed, 27 Jul 2011, Tom de Vries wrote:
> 
>> On 07/27/2011 02:12 PM, Richard Guenther wrote:
>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>
>>>> On 07/27/2011 01:50 PM, Tom de Vries wrote:
>>>>> Hi Richard,
>>>>>
>>>>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
>>>>>
>>>>> 01_pr43513.3.patch
>>>>> 02_pr43513.3.test.patch
>>>>> 03_pr43513.3.mudflap.patch
>>>>>
>>>>> The patch set has been bootstrapped and reg-tested on x86_64.
>>>>>
>>>>> I will sent out the patches individually.
>>>>>
>>>>
>>>> The patch replaces a vla __builtin_alloca that has a constant argument with an
>>>> array declaration.
>>>>
>>>> OK for trunk?
>>>
>>> I don't think it is safe to try to get at the VLA type the way you do.
>>
>> I don't understand in what way it's not safe. Do you mean I don't manage to find
>> the type always, or that I find the wrong type, or something else?
> 
> I think you might get the wrong type,

Ok, I'll review that code one more time.

> you also do not transform code
> like
> 
>   int *p = alloca(4);
>   *p = 3;
> 
> as there is no array type involved here.
> 

I was trying to stay away from non-vla allocas.  A source declared alloca has
function livetime, so we could have a single alloca in a loop, called 10 times,
with all 10 instances live at the same time. This patch does not detect such
cases, and thus stays away from non-vla allocas. A vla decl does not have such
problems, the lifetime ends when it goes out of scope.

>>> In fact I would simply do sth like
>>>
>>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>>>   n_elem = size * 8 / BITS_PER_UNIT;
>>>   array_type = build_array_type_nelts (elem_type, n_elem);
>>>   var = create_tmp_var (array_type, NULL);
>>>   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
>>>
>>
>> I tried this code on the example, and it works, but the newly declared type has
>> an 8-bit alignment, while the vla base type has a 32 bit alignment.  This make
>> the memory access in the example potentially unaligned, which prohibits an
>> ivopts optimization, so the resulting text size is 68 instead of the 64 achieved
>> with my current patch.
> 
> Ok, so then set DECL_ALIGN of the variable to something reasonable
> like MIN (size * 8, GET_MODE_PRECISION (word_mode)).  Basically the
> alignment that the targets alloca function would guarantee.
> 

I tried that, but that doesn't help. It's the alignment of the type that
matters, not of the decl.

So should we try to find the base type of the vla, and use that, or use the
nonstandard char type?

>>> And obviously you lose the optimization we arrange with inserting
>>> __builtin_stack_save/restore pairs that way - stack space will no
>>> longer be shared for subsequent VLAs.  Which means that you'd
>>> better limit the size you allow this promotion.
>>>
>>
>> Right, I could introduce a parameter for this.
> 
> I would think you could use PARAM_LARGE_STACK_FRAME for now and say,
> allow a size of PARAM_LARGE_STACK_FRAME / 10?
> 

That unfortunately is too small for the example from bug report. The default
value of the param is 250, so that would be a threshold of 25, and the alloca
size of the example is 40.  Perhaps we can try a threshold of
PARAM_LARGE_STACK_FRAME - estimated_stack_size or some such?

>>> Alternatively this promotion could happen alongsize 
>>> optimize_stack_restore using more global knowledge of the effects
>>> on the maximum stack size this folding produces.
>>>
>>
>> OK, I'll look into this.
> 

Thanks,
- Tom

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-27 16:12         ` Michael Matz
@ 2011-07-28  9:22           ` Richard Guenther
  0 siblings, 0 replies; 37+ messages in thread
From: Richard Guenther @ 2011-07-28  9:22 UTC (permalink / raw)
  To: Michael Matz; +Cc: Tom de Vries, gcc-patches

On Wed, 27 Jul 2011, Michael Matz wrote:

> Hi,
> 
> On Wed, 27 Jul 2011, Richard Guenther wrote:
> 
> > > > I don't think it is safe to try to get at the VLA type the way you do.
> > > 
> > > I don't understand in what way it's not safe. Do you mean I don't manage to find
> > > the type always, or that I find the wrong type, or something else?
> > 
> > I think you might get the wrong type, you also do not transform code
> > like
> > 
> >   int *p = alloca(4);
> >   *p = 3;
> > 
> > as there is no array type involved here.
> 
> That's good, because you _can't_ transform that code into an array decl.  
> See:
> 
>    for (int i = 0; i < 100; i++)
>      p[i] = alloca(4);
>    assert (p[0] != p[1]);
> 
> vs.
>    char vla_cst[4];
>    for (int i = 0; i < 100; i++)
>      p[i] = &vla_cst;
>    assert (p[0] != p[1]);

Hm, indeed ;)  At least not without more flow-sensitive analysis.

> Tom: you can reliably detect if an alloca call is for a VLA by checking 
> CALL_ALLOCA_FOR_VAR_P (on a tree call expression, but only if it's a 
> builtin call) or gimple_call_alloca_for_var_p (on a gimple call stmt).

Which actually hints that you should inline the folder into
gimple_fold_call/builtin where you still have this flag properly 
preserved.

Richard.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-27 17:46         ` Tom de Vries
@ 2011-07-28  9:44           ` Richard Guenther
  2011-07-29 12:09             ` Tom de Vries
  0 siblings, 1 reply; 37+ messages in thread
From: Richard Guenther @ 2011-07-28  9:44 UTC (permalink / raw)
  To: Tom de Vries; +Cc: gcc-patches

On Wed, 27 Jul 2011, Tom de Vries wrote:

> On 07/27/2011 05:27 PM, Richard Guenther wrote:
> > On Wed, 27 Jul 2011, Tom de Vries wrote:
> > 
> >> On 07/27/2011 02:12 PM, Richard Guenther wrote:
> >>> On Wed, 27 Jul 2011, Tom de Vries wrote:
> >>>
> >>>> On 07/27/2011 01:50 PM, Tom de Vries wrote:
> >>>>> Hi Richard,
> >>>>>
> >>>>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
> >>>>>
> >>>>> 01_pr43513.3.patch
> >>>>> 02_pr43513.3.test.patch
> >>>>> 03_pr43513.3.mudflap.patch
> >>>>>
> >>>>> The patch set has been bootstrapped and reg-tested on x86_64.
> >>>>>
> >>>>> I will sent out the patches individually.
> >>>>>
> >>>>
> >>>> The patch replaces a vla __builtin_alloca that has a constant argument with an
> >>>> array declaration.
> >>>>
> >>>> OK for trunk?
> >>>
> >>> I don't think it is safe to try to get at the VLA type the way you do.
> >>
> >> I don't understand in what way it's not safe. Do you mean I don't manage to find
> >> the type always, or that I find the wrong type, or something else?
> > 
> > I think you might get the wrong type,
> 
> Ok, I'll review that code one more time.
> 
> > you also do not transform code
> > like
> > 
> >   int *p = alloca(4);
> >   *p = 3;
> > 
> > as there is no array type involved here.
> > 
> 
> I was trying to stay away from non-vla allocas.  A source declared alloca has
> function livetime, so we could have a single alloca in a loop, called 10 times,
> with all 10 instances live at the same time. This patch does not detect such
> cases, and thus stays away from non-vla allocas. A vla decl does not have such
> problems, the lifetime ends when it goes out of scope.

Yes indeed - that probably would require more detailed analysis.

> >>> In fact I would simply do sth like
> >>>
> >>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
> >>>   n_elem = size * 8 / BITS_PER_UNIT;
> >>>   array_type = build_array_type_nelts (elem_type, n_elem);
> >>>   var = create_tmp_var (array_type, NULL);
> >>>   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
> >>>
> >>
> >> I tried this code on the example, and it works, but the newly declared type has
> >> an 8-bit alignment, while the vla base type has a 32 bit alignment.  This make
> >> the memory access in the example potentially unaligned, which prohibits an
> >> ivopts optimization, so the resulting text size is 68 instead of the 64 achieved
> >> with my current patch.
> > 
> > Ok, so then set DECL_ALIGN of the variable to something reasonable
> > like MIN (size * 8, GET_MODE_PRECISION (word_mode)).  Basically the
> > alignment that the targets alloca function would guarantee.
> > 
> 
> I tried that, but that doesn't help. It's the alignment of the type that
> matters, not of the decl.

It shouldn't.  All accesses are performed with the original types and
alignment comes from that (plus the underlying decl).

> So should we try to find the base type of the vla, and use that, or use the
> nonstandard char type?

I don't think we can reliably find the base type of the vla - well,
in practice we may because we control how we lower VLAs during
gimplification, but nothing in the IL constraints say that the
resulting pointer type should be special.

Using a char[] decl shouldn't be a problem IMHO.

> >>> And obviously you lose the optimization we arrange with inserting
> >>> __builtin_stack_save/restore pairs that way - stack space will no
> >>> longer be shared for subsequent VLAs.  Which means that you'd
> >>> better limit the size you allow this promotion.
> >>>
> >>
> >> Right, I could introduce a parameter for this.
> > 
> > I would think you could use PARAM_LARGE_STACK_FRAME for now and say,
> > allow a size of PARAM_LARGE_STACK_FRAME / 10?
> > 
> 
> That unfortunately is too small for the example from bug report. The default
> value of the param is 250, so that would be a threshold of 25, and the alloca
> size of the example is 40.  Perhaps we can try a threshold of
> PARAM_LARGE_STACK_FRAME - estimated_stack_size or some such?

Hm.  estimated_stack_size is not O(1), so no.  I think we need to
find a sensible way of allowing stack sharing.  Eventually Michas
patch for introducing points-of-death would help here, if we'd
go for folding this during stack-save/restore optimization.

Richard.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-29 12:09             ` Tom de Vries
@ 2011-07-28 16:00               ` Richard Guenther
  2011-07-28 18:09                 ` Tom de Vries
  0 siblings, 1 reply; 37+ messages in thread
From: Richard Guenther @ 2011-07-28 16:00 UTC (permalink / raw)
  To: Tom de Vries; +Cc: gcc-patches

On Thu, 28 Jul 2011, Tom de Vries wrote:

> On 07/28/2011 12:22 PM, Richard Guenther wrote:
> > On Wed, 27 Jul 2011, Tom de Vries wrote:
> > 
> >> On 07/27/2011 05:27 PM, Richard Guenther wrote:
> >>> On Wed, 27 Jul 2011, Tom de Vries wrote:
> >>>
> >>>> On 07/27/2011 02:12 PM, Richard Guenther wrote:
> >>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
> >>>>>
> >>>>>> On 07/27/2011 01:50 PM, Tom de Vries wrote:
> >>>>>>> Hi Richard,
> >>>>>>>
> >>>>>>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
> >>>>>>>
> >>>>>>> 01_pr43513.3.patch
> >>>>>>> 02_pr43513.3.test.patch
> >>>>>>> 03_pr43513.3.mudflap.patch
> >>>>>>>
> >>>>>>> The patch set has been bootstrapped and reg-tested on x86_64.
> >>>>>>>
> >>>>>>> I will sent out the patches individually.
> >>>>>>>
> >>>>>>
> >>>>>> The patch replaces a vla __builtin_alloca that has a constant argument with an
> >>>>>> array declaration.
> >>>>>>
> >>>>>> OK for trunk?
> >>>>>
> >>>>> I don't think it is safe to try to get at the VLA type the way you do.
> >>>>
> >>>> I don't understand in what way it's not safe. Do you mean I don't manage to find
> >>>> the type always, or that I find the wrong type, or something else?
> >>>
> >>> I think you might get the wrong type,
> >>
> >> Ok, I'll review that code one more time.
> >>
> >>> you also do not transform code
> >>> like
> >>>
> >>>   int *p = alloca(4);
> >>>   *p = 3;
> >>>
> >>> as there is no array type involved here.
> >>>
> >>
> >> I was trying to stay away from non-vla allocas.  A source declared alloca has
> >> function livetime, so we could have a single alloca in a loop, called 10 times,
> >> with all 10 instances live at the same time. This patch does not detect such
> >> cases, and thus stays away from non-vla allocas. A vla decl does not have such
> >> problems, the lifetime ends when it goes out of scope.
> > 
> > Yes indeed - that probably would require more detailed analysis.
> > 
> >>>>> In fact I would simply do sth like
> >>>>>
> >>>>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
> >>>>>   n_elem = size * 8 / BITS_PER_UNIT;
> >>>>>   array_type = build_array_type_nelts (elem_type, n_elem);
> >>>>>   var = create_tmp_var (array_type, NULL);
> >>>>>   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
> >>>>>
> >>>>
> >>>> I tried this code on the example, and it works, but the newly declared type has
> >>>> an 8-bit alignment, while the vla base type has a 32 bit alignment.  This make
> >>>> the memory access in the example potentially unaligned, which prohibits an
> >>>> ivopts optimization, so the resulting text size is 68 instead of the 64 achieved
> >>>> with my current patch.
> >>>
> >>> Ok, so then set DECL_ALIGN of the variable to something reasonable
> >>> like MIN (size * 8, GET_MODE_PRECISION (word_mode)).  Basically the
> >>> alignment that the targets alloca function would guarantee.
> >>>
> >>
> >> I tried that, but that doesn't help. It's the alignment of the type that
> >> matters, not of the decl.
> > 
> > It shouldn't.  All accesses are performed with the original types and
> > alignment comes from that (plus the underlying decl).
> > 
> 
> I managed to get it all working by using build_aligned_type rather that DECL_ALIGN.

That's really odd, DECL_ALIGN should just work - nothing refers to the
type of the decl in the IL.  Can you try also setting DECL_USER_ALIGN to 
1 maybe?

> 
> >> So should we try to find the base type of the vla, and use that, or use the
> >> nonstandard char type?
> > 
> > I don't think we can reliably find the base type of the vla - well,
> > in practice we may because we control how we lower VLAs during
> > gimplification, but nothing in the IL constraints say that the
> > resulting pointer type should be special.
> > 
> > Using a char[] decl shouldn't be a problem IMHO.
> > 
> >>>>> And obviously you lose the optimization we arrange with inserting
> >>>>> __builtin_stack_save/restore pairs that way - stack space will no
> >>>>> longer be shared for subsequent VLAs.  Which means that you'd
> >>>>> better limit the size you allow this promotion.
> >>>>>
> >>>>
> >>>> Right, I could introduce a parameter for this.
> >>>
> >>> I would think you could use PARAM_LARGE_STACK_FRAME for now and say,
> >>> allow a size of PARAM_LARGE_STACK_FRAME / 10?
> >>>
> >>
> >> That unfortunately is too small for the example from bug report. The default
> >> value of the param is 250, so that would be a threshold of 25, and the alloca
> >> size of the example is 40.  Perhaps we can try a threshold of
> >> PARAM_LARGE_STACK_FRAME - estimated_stack_size or some such?
> > 
> > Hm.  estimated_stack_size is not O(1), so no.  I think we need to
> > find a sensible way of allowing stack sharing.  Eventually Michas
> > patch for introducing points-of-death would help here, if we'd
> > go for folding this during stack-save/restore optimization.
> > 
> 
> I changed the heuristics to this:
> 
> +  /* Heuristic: don't fold large vlas.  */
> +  threshold = (unsigned HOST_WIDE_INT)PARAM_VALUE (PARAM_LARGE_STACK_FRAME);
> +  /* In case a vla is declared at function scope, it has the same lifetime as a
> +     declared array, so we allow a larger size.  */
> +  block = gimple_block (stmt);
> +  if (!(cfun->after_inlining
> +        && TREE_CODE (BLOCK_SUPERCONTEXT (block)) == FUNCTION_DECL))
> +    threshold /= 10;
> +  if (size > threshold)
> +    return NULL_TREE;
> 
> The heuristics distinguishes between before and after inlining.
> 
> After inlining, vla's declared at function scope have the same lifetimes as
> declared arrays, and don't share their space. There should be no negative
> effects from folding an alloca in this case, but for safety we set a threshold
> of PARAM_LARGE_STACK_FRAME.
> 
> Before inlining, such a vla might be inlined and share its space with another
> vla, so we stick with the normal threshold before inlining.

That sounds reasonable, though the block check should probably use the
original VLA decl block, not that of the basic-block of the allocation,
but unfortunately we don't have access to that.  So I suppose using
the allocation basic-block BLOCK is good enough (still we don't
really care about BLOCK boundaries when doing CFG manipulations, so
the allocation bbs block may be not the outermost scope in more cases
than necessary).

> However, using this heuristic we still don't generate optimal code.
> 
> During the first pass_ccp, the folding is not done, because the size (40) is
> larger than the threshold 25. The threshold is 25, because inlining is not yet done.
> 
> During pass_fold_builtins, the folding is done because it's after inlining, but
> it's later than pass_iv_optimize, so that still doesn't yield the optimal size
> of 64.
> 
> The folding is not done during any of the other invocations or pass_ccp, because
> the argument has already become constant in the earlier invocation.

Yeah, that's the issue with relying on folding to do this transformation.

> Using this change, I manage to trigger folding during the second invocation of
> pass_ccp, before iv_optimize so we generate optimal code.
> 
> Index: gcc/tree-ssa-ccp.c
> ===================================================================
> --- gcc/tree-ssa-ccp.c (revision 173734)
> +++ gcc/tree-ssa-ccp.c (working copy)
> @@ -1727,6 +1727,13 @@ ccp_fold_stmt (gimple_stmt_iterator *gsi
>  	if (gimple_call_internal_p (stmt))
>  	  return false;
> 
> +        /* The heuristic of fold_builtin_alloca differs before and after
> +           inlining, so we don't require the arg to be changed into a constant
> +           for folding, but just to be constant.  */
> +        if (gimple_call_alloca_for_var_p (stmt)
> +            && get_constant_value (gimple_call_arg (stmt, 0)))

Probably reverse the get_constant_value check and the transformation
(gimple_call_alloca_for_var_p isn't a predicate as it has side-effects,
so its name should be changed).

> +          return true;
> +
>  	/* Propagate into the call arguments.  Compared to replace_uses_in
>  	   this can use the argument slot types for type verification
>  	   instead of the current argument type.  We also can safely
> 
> But, to me it feels like a hack. Do you have any ideas how to do this better?

It's somewhat of a hack, but at least it is more of a defined place
for this transformation - which then suggests to remove it from
generic folding and only keep calling it from CCP this way.

Richard.

> Attaching untested patch for reference (will test overnight).
> 
> Thanks,
> - Tom
> 
> 2011-07-28  Tom de Vries  <tom@codesourcery.com>
> 
> 	PR middle-end/43513
> 	* gimple-fold.c (params.h): Include.
> 	(fold_builtin_alloca): New function.
> 	(gimple_fold_builtin): Use fold_builtin_alloca.
> 	* tree-ssa-ccp.c (ccp_fold_stmt): Force folding of vla-related alloca.
> 	* Makefile.in (gimple-fold.o): Add $(PARAMS_H) to rule.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-28 16:00               ` Richard Guenther
@ 2011-07-28 18:09                 ` Tom de Vries
  2011-07-28 19:38                   ` Richard Guenther
  2011-07-30  9:01                   ` Tom de Vries
  0 siblings, 2 replies; 37+ messages in thread
From: Tom de Vries @ 2011-07-28 18:09 UTC (permalink / raw)
  To: Richard Guenther; +Cc: gcc-patches, Zdenek Dvorak

On 07/28/2011 06:25 PM, Richard Guenther wrote:
> On Thu, 28 Jul 2011, Tom de Vries wrote:
> 
>> On 07/28/2011 12:22 PM, Richard Guenther wrote:
>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>
>>>> On 07/27/2011 05:27 PM, Richard Guenther wrote:
>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>
>>>>>> On 07/27/2011 02:12 PM, Richard Guenther wrote:
>>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>>
>>>>>>>> On 07/27/2011 01:50 PM, Tom de Vries wrote:
>>>>>>>>> Hi Richard,
>>>>>>>>>
>>>>>>>>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
>>>>>>>>>
>>>>>>>>> 01_pr43513.3.patch
>>>>>>>>> 02_pr43513.3.test.patch
>>>>>>>>> 03_pr43513.3.mudflap.patch
>>>>>>>>>
>>>>>>>>> The patch set has been bootstrapped and reg-tested on x86_64.
>>>>>>>>>
>>>>>>>>> I will sent out the patches individually.
>>>>>>>>>
>>>>>>>>
>>>>>>>> The patch replaces a vla __builtin_alloca that has a constant argument with an
>>>>>>>> array declaration.
>>>>>>>>
>>>>>>>> OK for trunk?
>>>>>>>
>>>>>>> I don't think it is safe to try to get at the VLA type the way you do.
>>>>>>
>>>>>> I don't understand in what way it's not safe. Do you mean I don't manage to find
>>>>>> the type always, or that I find the wrong type, or something else?
>>>>>
>>>>> I think you might get the wrong type,
>>>>
>>>> Ok, I'll review that code one more time.
>>>>
>>>>> you also do not transform code
>>>>> like
>>>>>
>>>>>   int *p = alloca(4);
>>>>>   *p = 3;
>>>>>
>>>>> as there is no array type involved here.
>>>>>
>>>>
>>>> I was trying to stay away from non-vla allocas.  A source declared alloca has
>>>> function livetime, so we could have a single alloca in a loop, called 10 times,
>>>> with all 10 instances live at the same time. This patch does not detect such
>>>> cases, and thus stays away from non-vla allocas. A vla decl does not have such
>>>> problems, the lifetime ends when it goes out of scope.
>>>
>>> Yes indeed - that probably would require more detailed analysis.
>>>
>>>>>>> In fact I would simply do sth like
>>>>>>>
>>>>>>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>>>>>>>   n_elem = size * 8 / BITS_PER_UNIT;
>>>>>>>   array_type = build_array_type_nelts (elem_type, n_elem);
>>>>>>>   var = create_tmp_var (array_type, NULL);
>>>>>>>   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
>>>>>>>
>>>>>>
>>>>>> I tried this code on the example, and it works, but the newly declared type has
>>>>>> an 8-bit alignment, while the vla base type has a 32 bit alignment.  This make
>>>>>> the memory access in the example potentially unaligned, which prohibits an
>>>>>> ivopts optimization, so the resulting text size is 68 instead of the 64 achieved
>>>>>> with my current patch.
>>>>>
>>>>> Ok, so then set DECL_ALIGN of the variable to something reasonable
>>>>> like MIN (size * 8, GET_MODE_PRECISION (word_mode)).  Basically the
>>>>> alignment that the targets alloca function would guarantee.
>>>>>
>>>>
>>>> I tried that, but that doesn't help. It's the alignment of the type that
>>>> matters, not of the decl.
>>>
>>> It shouldn't.  All accesses are performed with the original types and
>>> alignment comes from that (plus the underlying decl).
>>>
>>
>> I managed to get it all working by using build_aligned_type rather that DECL_ALIGN.
> 
> That's really odd, DECL_ALIGN should just work - nothing refers to the
> type of the decl in the IL.  Can you try also setting DECL_USER_ALIGN to 
> 1 maybe?
> 

This doesn't work either.

  /* Declare array.  */
  elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
  n_elem = size * 8 / BITS_PER_UNIT;
  align = MIN (size * 8, GET_MODE_PRECISION (word_mode));
  array_type = build_array_type_nelts (elem_type, n_elem);
  var = create_tmp_var (array_type, NULL);
  DECL_ALIGN (var) = align;
  DECL_USER_ALIGN (var) = 1;

Maybe this clarifies it:

Breakpoint 1, may_be_unaligned_p (ref=0xf7d9d410, step=0xf7d3d578) at
/home/vries/local/google/src/gcc-mainline/gcc/tree-ssa-loop-ivopts.c:1621
(gdb) call debug_generic_expr (ref)
MEM[(int[0:D.2579] *)&D.2595][0]
(gdb) call debug_generic_expr (step)
4

1627	  base = get_inner_reference (ref, &bitsize, &bitpos, &toffset, &mode,
(gdb) call debug_generic_expr (base)
D.2595

1629	  base_type = TREE_TYPE (base);
(gdb) call debug_generic_expr (base_type)
<unnamed-unsigned:8>[40]

1630	  base_align = TYPE_ALIGN (base_type);
(gdb) p base_align
$1 = 8

So the align is 8-bits, and we return true here:

(gdb) n
1632	  if (mode != BLKmode)
(gdb) n
1634	      unsigned mode_align = GET_MODE_ALIGNMENT (mode);
(gdb)
1636	      if (base_align < mode_align
(gdb)
1639		return true;


Here we can see that the base actually has the (user) align on it:

(gdb) call debug_tree (base)
 <var_decl 0xf7e1b420 D.2595
    type <array_type 0xf7e1b360
        type <integer_type 0xf7e1b2a0 public unsigned QI
            size <integer_cst 0xf7d3d604 constant 8>
            unit size <integer_cst 0xf7d3d620 constant 1>
            align 8 symtab 0 alias set -1 canonical type 0xf7e1b2a0 precision 8
            min <integer_cst 0xf7dffaf0 0> max <integer_cst 0xf7dffb0c 255>
            pointer_to_this <pointer_type 0xf7e1b3c0>>
        BLK
        size <integer_cst 0xf7d5d070 constant 320>
        unit size <integer_cst 0xf7dde2a0 constant 40>
        align 8 symtab 0 alias set -1 canonical type 0xf7e1b360
        domain <integer_type 0xf7de12a0
                type <integer_type 0xf7d51000 unsigned int>
            SI
            size <integer_cst 0xf7d3d78c constant 32>
            unit size <integer_cst 0xf7d3d578 constant 4>
            align 32 symtab 0 alias set -1 canonical type 0xf7de12a0
            precision 32 min <integer_cst 0xf7d3d594 0>
            max <integer_cst 0xf7dde284 39>>
        pointer_to_this <pointer_type 0xf7e1b480>>
    addressable used ignored BLK file pr43513.c line 4 col 6
    size <integer_cst 0xf7d5d070 320> unit size <integer_cst 0xf7dde2a0 40>
    user align 32 context <function_decl 0xf7dfd480 foo3>>


>>
>>>> So should we try to find the base type of the vla, and use that, or use the
>>>> nonstandard char type?
>>>
>>> I don't think we can reliably find the base type of the vla - well,
>>> in practice we may because we control how we lower VLAs during
>>> gimplification, but nothing in the IL constraints say that the
>>> resulting pointer type should be special.
>>>
>>> Using a char[] decl shouldn't be a problem IMHO.
>>>
>>>>>>> And obviously you lose the optimization we arrange with inserting
>>>>>>> __builtin_stack_save/restore pairs that way - stack space will no
>>>>>>> longer be shared for subsequent VLAs.  Which means that you'd
>>>>>>> better limit the size you allow this promotion.
>>>>>>>
>>>>>>
>>>>>> Right, I could introduce a parameter for this.
>>>>>
>>>>> I would think you could use PARAM_LARGE_STACK_FRAME for now and say,
>>>>> allow a size of PARAM_LARGE_STACK_FRAME / 10?
>>>>>
>>>>
>>>> That unfortunately is too small for the example from bug report. The default
>>>> value of the param is 250, so that would be a threshold of 25, and the alloca
>>>> size of the example is 40.  Perhaps we can try a threshold of
>>>> PARAM_LARGE_STACK_FRAME - estimated_stack_size or some such?
>>>
>>> Hm.  estimated_stack_size is not O(1), so no.  I think we need to
>>> find a sensible way of allowing stack sharing.  Eventually Michas
>>> patch for introducing points-of-death would help here, if we'd
>>> go for folding this during stack-save/restore optimization.
>>>
>>
>> I changed the heuristics to this:
>>
>> +  /* Heuristic: don't fold large vlas.  */
>> +  threshold = (unsigned HOST_WIDE_INT)PARAM_VALUE (PARAM_LARGE_STACK_FRAME);
>> +  /* In case a vla is declared at function scope, it has the same lifetime as a
>> +     declared array, so we allow a larger size.  */
>> +  block = gimple_block (stmt);
>> +  if (!(cfun->after_inlining
>> +        && TREE_CODE (BLOCK_SUPERCONTEXT (block)) == FUNCTION_DECL))
>> +    threshold /= 10;
>> +  if (size > threshold)
>> +    return NULL_TREE;
>>
>> The heuristics distinguishes between before and after inlining.
>>
>> After inlining, vla's declared at function scope have the same lifetimes as
>> declared arrays, and don't share their space. There should be no negative
>> effects from folding an alloca in this case, but for safety we set a threshold
>> of PARAM_LARGE_STACK_FRAME.
>>
>> Before inlining, such a vla might be inlined and share its space with another
>> vla, so we stick with the normal threshold before inlining.
> 
> That sounds reasonable, though the block check should probably use the
> original VLA decl block, not that of the basic-block of the allocation,
> but unfortunately we don't have access to that.  So I suppose using
> the allocation basic-block BLOCK is good enough (still we don't
> really care about BLOCK boundaries when doing CFG manipulations, so
> the allocation bbs block may be not the outermost scope in more cases
> than necessary).
> 
>> However, using this heuristic we still don't generate optimal code.
>>
>> During the first pass_ccp, the folding is not done, because the size (40) is
>> larger than the threshold 25. The threshold is 25, because inlining is not yet done.
>>
>> During pass_fold_builtins, the folding is done because it's after inlining, but
>> it's later than pass_iv_optimize, so that still doesn't yield the optimal size
>> of 64.
>>
>> The folding is not done during any of the other invocations or pass_ccp, because
>> the argument has already become constant in the earlier invocation.
> 
> Yeah, that's the issue with relying on folding to do this transformation.
> 
>> Using this change, I manage to trigger folding during the second invocation of
>> pass_ccp, before iv_optimize so we generate optimal code.
>>
>> Index: gcc/tree-ssa-ccp.c
>> ===================================================================
>> --- gcc/tree-ssa-ccp.c (revision 173734)
>> +++ gcc/tree-ssa-ccp.c (working copy)
>> @@ -1727,6 +1727,13 @@ ccp_fold_stmt (gimple_stmt_iterator *gsi
>>  	if (gimple_call_internal_p (stmt))
>>  	  return false;
>>
>> +        /* The heuristic of fold_builtin_alloca differs before and after
>> +           inlining, so we don't require the arg to be changed into a constant
>> +           for folding, but just to be constant.  */
>> +        if (gimple_call_alloca_for_var_p (stmt)
>> +            && get_constant_value (gimple_call_arg (stmt, 0)))
> 
> Probably reverse the get_constant_value check and the transformation

Done.

> (gimple_call_alloca_for_var_p isn't a predicate as it has side-effects,
> so its name should be changed).
> 
>> +          return true;
>> +
>>  	/* Propagate into the call arguments.  Compared to replace_uses_in
>>  	   this can use the argument slot types for type verification
>>  	   instead of the current argument type.  We also can safely
>>
>> But, to me it feels like a hack. Do you have any ideas how to do this better?
> 
> It's somewhat of a hack, but at least it is more of a defined place
> for this transformation - which then suggests to remove it from
> generic folding and only keep calling it from CCP this way.
> 

Done.

> Richard.
> 
>> Attaching untested patch for reference (will test overnight).
>>
>> Thanks,
>> - Tom
>>
>> 2011-07-28  Tom de Vries  <tom@codesourcery.com>
>>
>> 	PR middle-end/43513
>> 	* gimple-fold.c (params.h): Include.
>> 	(fold_builtin_alloca): New function.
>> 	(gimple_fold_builtin): Use fold_builtin_alloca.
>> 	* tree-ssa-ccp.c (ccp_fold_stmt): Force folding of vla-related alloca.
>> 	* Makefile.in (gimple-fold.o): Add $(PARAMS_H) to rule.

Thanks,
- Tom

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513, 1/3] Replace vla with array - Implementation.
  2011-07-28 18:09                 ` Tom de Vries
@ 2011-07-28 19:38                   ` Richard Guenther
  2011-07-30  9:01                   ` Tom de Vries
  1 sibling, 0 replies; 37+ messages in thread
From: Richard Guenther @ 2011-07-28 19:38 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Richard Guenther, gcc-patches, Zdenek Dvorak

On Thu, Jul 28, 2011 at 7:20 PM, Tom de Vries <vries@codesourcery.com> wrote:
> On 07/28/2011 06:25 PM, Richard Guenther wrote:
>> On Thu, 28 Jul 2011, Tom de Vries wrote:
>>
>>> On 07/28/2011 12:22 PM, Richard Guenther wrote:
>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>
>>>>> On 07/27/2011 05:27 PM, Richard Guenther wrote:
>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>
>>>>>>> On 07/27/2011 02:12 PM, Richard Guenther wrote:
>>>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>>>
>>>>>>>>> On 07/27/2011 01:50 PM, Tom de Vries wrote:
>>>>>>>>>> Hi Richard,
>>>>>>>>>>
>>>>>>>>>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
>>>>>>>>>>
>>>>>>>>>> 01_pr43513.3.patch
>>>>>>>>>> 02_pr43513.3.test.patch
>>>>>>>>>> 03_pr43513.3.mudflap.patch
>>>>>>>>>>
>>>>>>>>>> The patch set has been bootstrapped and reg-tested on x86_64.
>>>>>>>>>>
>>>>>>>>>> I will sent out the patches individually.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> The patch replaces a vla __builtin_alloca that has a constant argument with an
>>>>>>>>> array declaration.
>>>>>>>>>
>>>>>>>>> OK for trunk?
>>>>>>>>
>>>>>>>> I don't think it is safe to try to get at the VLA type the way you do.
>>>>>>>
>>>>>>> I don't understand in what way it's not safe. Do you mean I don't manage to find
>>>>>>> the type always, or that I find the wrong type, or something else?
>>>>>>
>>>>>> I think you might get the wrong type,
>>>>>
>>>>> Ok, I'll review that code one more time.
>>>>>
>>>>>> you also do not transform code
>>>>>> like
>>>>>>
>>>>>>   int *p = alloca(4);
>>>>>>   *p = 3;
>>>>>>
>>>>>> as there is no array type involved here.
>>>>>>
>>>>>
>>>>> I was trying to stay away from non-vla allocas.  A source declared alloca has
>>>>> function livetime, so we could have a single alloca in a loop, called 10 times,
>>>>> with all 10 instances live at the same time. This patch does not detect such
>>>>> cases, and thus stays away from non-vla allocas. A vla decl does not have such
>>>>> problems, the lifetime ends when it goes out of scope.
>>>>
>>>> Yes indeed - that probably would require more detailed analysis.
>>>>
>>>>>>>> In fact I would simply do sth like
>>>>>>>>
>>>>>>>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>>>>>>>>   n_elem = size * 8 / BITS_PER_UNIT;
>>>>>>>>   array_type = build_array_type_nelts (elem_type, n_elem);
>>>>>>>>   var = create_tmp_var (array_type, NULL);
>>>>>>>>   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
>>>>>>>>
>>>>>>>
>>>>>>> I tried this code on the example, and it works, but the newly declared type has
>>>>>>> an 8-bit alignment, while the vla base type has a 32 bit alignment.  This make
>>>>>>> the memory access in the example potentially unaligned, which prohibits an
>>>>>>> ivopts optimization, so the resulting text size is 68 instead of the 64 achieved
>>>>>>> with my current patch.
>>>>>>
>>>>>> Ok, so then set DECL_ALIGN of the variable to something reasonable
>>>>>> like MIN (size * 8, GET_MODE_PRECISION (word_mode)).  Basically the
>>>>>> alignment that the targets alloca function would guarantee.
>>>>>>
>>>>>
>>>>> I tried that, but that doesn't help. It's the alignment of the type that
>>>>> matters, not of the decl.
>>>>
>>>> It shouldn't.  All accesses are performed with the original types and
>>>> alignment comes from that (plus the underlying decl).
>>>>
>>>
>>> I managed to get it all working by using build_aligned_type rather that DECL_ALIGN.
>>
>> That's really odd, DECL_ALIGN should just work - nothing refers to the
>> type of the decl in the IL.  Can you try also setting DECL_USER_ALIGN to
>> 1 maybe?
>>
>
> This doesn't work either.
>
>  /* Declare array.  */
>  elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>  n_elem = size * 8 / BITS_PER_UNIT;
>  align = MIN (size * 8, GET_MODE_PRECISION (word_mode));
>  array_type = build_array_type_nelts (elem_type, n_elem);
>  var = create_tmp_var (array_type, NULL);
>  DECL_ALIGN (var) = align;
>  DECL_USER_ALIGN (var) = 1;
>
> Maybe this clarifies it:
>
> Breakpoint 1, may_be_unaligned_p (ref=0xf7d9d410, step=0xf7d3d578) at
> /home/vries/local/google/src/gcc-mainline/gcc/tree-ssa-loop-ivopts.c:1621
> (gdb) call debug_generic_expr (ref)
> MEM[(int[0:D.2579] *)&D.2595][0]
> (gdb) call debug_generic_expr (step)
> 4
>
> 1627      base = get_inner_reference (ref, &bitsize, &bitpos, &toffset, &mode,
> (gdb) call debug_generic_expr (base)
> D.2595
>
> 1629      base_type = TREE_TYPE (base);
> (gdb) call debug_generic_expr (base_type)
> <unnamed-unsigned:8>[40]
>
> 1630      base_align = TYPE_ALIGN (base_type);
> (gdb) p base_align
> $1 = 8
>
> So the align is 8-bits, and we return true here:

Ah, but this code should use get_object_alignment, not solely look
at the type.

Richard.

> (gdb) n
> 1632      if (mode != BLKmode)
> (gdb) n
> 1634          unsigned mode_align = GET_MODE_ALIGNMENT (mode);
> (gdb)
> 1636          if (base_align < mode_align
> (gdb)
> 1639            return true;
>
>
> Here we can see that the base actually has the (user) align on it:
>
> (gdb) call debug_tree (base)
>  <var_decl 0xf7e1b420 D.2595
>    type <array_type 0xf7e1b360
>        type <integer_type 0xf7e1b2a0 public unsigned QI
>            size <integer_cst 0xf7d3d604 constant 8>
>            unit size <integer_cst 0xf7d3d620 constant 1>
>            align 8 symtab 0 alias set -1 canonical type 0xf7e1b2a0 precision 8
>            min <integer_cst 0xf7dffaf0 0> max <integer_cst 0xf7dffb0c 255>
>            pointer_to_this <pointer_type 0xf7e1b3c0>>
>        BLK
>        size <integer_cst 0xf7d5d070 constant 320>
>        unit size <integer_cst 0xf7dde2a0 constant 40>
>        align 8 symtab 0 alias set -1 canonical type 0xf7e1b360
>        domain <integer_type 0xf7de12a0
>                type <integer_type 0xf7d51000 unsigned int>
>            SI
>            size <integer_cst 0xf7d3d78c constant 32>
>            unit size <integer_cst 0xf7d3d578 constant 4>
>            align 32 symtab 0 alias set -1 canonical type 0xf7de12a0
>            precision 32 min <integer_cst 0xf7d3d594 0>
>            max <integer_cst 0xf7dde284 39>>
>        pointer_to_this <pointer_type 0xf7e1b480>>
>    addressable used ignored BLK file pr43513.c line 4 col 6
>    size <integer_cst 0xf7d5d070 320> unit size <integer_cst 0xf7dde2a0 40>
>    user align 32 context <function_decl 0xf7dfd480 foo3>>
>
>
>>>
>>>>> So should we try to find the base type of the vla, and use that, or use the
>>>>> nonstandard char type?
>>>>
>>>> I don't think we can reliably find the base type of the vla - well,
>>>> in practice we may because we control how we lower VLAs during
>>>> gimplification, but nothing in the IL constraints say that the
>>>> resulting pointer type should be special.
>>>>
>>>> Using a char[] decl shouldn't be a problem IMHO.
>>>>
>>>>>>>> And obviously you lose the optimization we arrange with inserting
>>>>>>>> __builtin_stack_save/restore pairs that way - stack space will no
>>>>>>>> longer be shared for subsequent VLAs.  Which means that you'd
>>>>>>>> better limit the size you allow this promotion.
>>>>>>>>
>>>>>>>
>>>>>>> Right, I could introduce a parameter for this.
>>>>>>
>>>>>> I would think you could use PARAM_LARGE_STACK_FRAME for now and say,
>>>>>> allow a size of PARAM_LARGE_STACK_FRAME / 10?
>>>>>>
>>>>>
>>>>> That unfortunately is too small for the example from bug report. The default
>>>>> value of the param is 250, so that would be a threshold of 25, and the alloca
>>>>> size of the example is 40.  Perhaps we can try a threshold of
>>>>> PARAM_LARGE_STACK_FRAME - estimated_stack_size or some such?
>>>>
>>>> Hm.  estimated_stack_size is not O(1), so no.  I think we need to
>>>> find a sensible way of allowing stack sharing.  Eventually Michas
>>>> patch for introducing points-of-death would help here, if we'd
>>>> go for folding this during stack-save/restore optimization.
>>>>
>>>
>>> I changed the heuristics to this:
>>>
>>> +  /* Heuristic: don't fold large vlas.  */
>>> +  threshold = (unsigned HOST_WIDE_INT)PARAM_VALUE (PARAM_LARGE_STACK_FRAME);
>>> +  /* In case a vla is declared at function scope, it has the same lifetime as a
>>> +     declared array, so we allow a larger size.  */
>>> +  block = gimple_block (stmt);
>>> +  if (!(cfun->after_inlining
>>> +        && TREE_CODE (BLOCK_SUPERCONTEXT (block)) == FUNCTION_DECL))
>>> +    threshold /= 10;
>>> +  if (size > threshold)
>>> +    return NULL_TREE;
>>>
>>> The heuristics distinguishes between before and after inlining.
>>>
>>> After inlining, vla's declared at function scope have the same lifetimes as
>>> declared arrays, and don't share their space. There should be no negative
>>> effects from folding an alloca in this case, but for safety we set a threshold
>>> of PARAM_LARGE_STACK_FRAME.
>>>
>>> Before inlining, such a vla might be inlined and share its space with another
>>> vla, so we stick with the normal threshold before inlining.
>>
>> That sounds reasonable, though the block check should probably use the
>> original VLA decl block, not that of the basic-block of the allocation,
>> but unfortunately we don't have access to that.  So I suppose using
>> the allocation basic-block BLOCK is good enough (still we don't
>> really care about BLOCK boundaries when doing CFG manipulations, so
>> the allocation bbs block may be not the outermost scope in more cases
>> than necessary).
>>
>>> However, using this heuristic we still don't generate optimal code.
>>>
>>> During the first pass_ccp, the folding is not done, because the size (40) is
>>> larger than the threshold 25. The threshold is 25, because inlining is not yet done.
>>>
>>> During pass_fold_builtins, the folding is done because it's after inlining, but
>>> it's later than pass_iv_optimize, so that still doesn't yield the optimal size
>>> of 64.
>>>
>>> The folding is not done during any of the other invocations or pass_ccp, because
>>> the argument has already become constant in the earlier invocation.
>>
>> Yeah, that's the issue with relying on folding to do this transformation.
>>
>>> Using this change, I manage to trigger folding during the second invocation of
>>> pass_ccp, before iv_optimize so we generate optimal code.
>>>
>>> Index: gcc/tree-ssa-ccp.c
>>> ===================================================================
>>> --- gcc/tree-ssa-ccp.c (revision 173734)
>>> +++ gcc/tree-ssa-ccp.c (working copy)
>>> @@ -1727,6 +1727,13 @@ ccp_fold_stmt (gimple_stmt_iterator *gsi
>>>      if (gimple_call_internal_p (stmt))
>>>        return false;
>>>
>>> +        /* The heuristic of fold_builtin_alloca differs before and after
>>> +           inlining, so we don't require the arg to be changed into a constant
>>> +           for folding, but just to be constant.  */
>>> +        if (gimple_call_alloca_for_var_p (stmt)
>>> +            && get_constant_value (gimple_call_arg (stmt, 0)))
>>
>> Probably reverse the get_constant_value check and the transformation
>
> Done.
>
>> (gimple_call_alloca_for_var_p isn't a predicate as it has side-effects,
>> so its name should be changed).
>>
>>> +          return true;
>>> +
>>>      /* Propagate into the call arguments.  Compared to replace_uses_in
>>>         this can use the argument slot types for type verification
>>>         instead of the current argument type.  We also can safely
>>>
>>> But, to me it feels like a hack. Do you have any ideas how to do this better?
>>
>> It's somewhat of a hack, but at least it is more of a defined place
>> for this transformation - which then suggests to remove it from
>> generic folding and only keep calling it from CCP this way.
>>
>
> Done.
>
>> Richard.
>>
>>> Attaching untested patch for reference (will test overnight).
>>>
>>> Thanks,
>>> - Tom
>>>
>>> 2011-07-28  Tom de Vries  <tom@codesourcery.com>
>>>
>>>      PR middle-end/43513
>>>      * gimple-fold.c (params.h): Include.
>>>      (fold_builtin_alloca): New function.
>>>      (gimple_fold_builtin): Use fold_builtin_alloca.
>>>      * tree-ssa-ccp.c (ccp_fold_stmt): Force folding of vla-related alloca.
>>>      * Makefile.in (gimple-fold.o): Add $(PARAMS_H) to rule.
>
> Thanks,
> - Tom
>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-28  9:44           ` Richard Guenther
@ 2011-07-29 12:09             ` Tom de Vries
  2011-07-28 16:00               ` Richard Guenther
  0 siblings, 1 reply; 37+ messages in thread
From: Tom de Vries @ 2011-07-29 12:09 UTC (permalink / raw)
  To: Richard Guenther; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 7456 bytes --]

On 07/28/2011 12:22 PM, Richard Guenther wrote:
> On Wed, 27 Jul 2011, Tom de Vries wrote:
> 
>> On 07/27/2011 05:27 PM, Richard Guenther wrote:
>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>
>>>> On 07/27/2011 02:12 PM, Richard Guenther wrote:
>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>
>>>>>> On 07/27/2011 01:50 PM, Tom de Vries wrote:
>>>>>>> Hi Richard,
>>>>>>>
>>>>>>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
>>>>>>>
>>>>>>> 01_pr43513.3.patch
>>>>>>> 02_pr43513.3.test.patch
>>>>>>> 03_pr43513.3.mudflap.patch
>>>>>>>
>>>>>>> The patch set has been bootstrapped and reg-tested on x86_64.
>>>>>>>
>>>>>>> I will sent out the patches individually.
>>>>>>>
>>>>>>
>>>>>> The patch replaces a vla __builtin_alloca that has a constant argument with an
>>>>>> array declaration.
>>>>>>
>>>>>> OK for trunk?
>>>>>
>>>>> I don't think it is safe to try to get at the VLA type the way you do.
>>>>
>>>> I don't understand in what way it's not safe. Do you mean I don't manage to find
>>>> the type always, or that I find the wrong type, or something else?
>>>
>>> I think you might get the wrong type,
>>
>> Ok, I'll review that code one more time.
>>
>>> you also do not transform code
>>> like
>>>
>>>   int *p = alloca(4);
>>>   *p = 3;
>>>
>>> as there is no array type involved here.
>>>
>>
>> I was trying to stay away from non-vla allocas.  A source declared alloca has
>> function livetime, so we could have a single alloca in a loop, called 10 times,
>> with all 10 instances live at the same time. This patch does not detect such
>> cases, and thus stays away from non-vla allocas. A vla decl does not have such
>> problems, the lifetime ends when it goes out of scope.
> 
> Yes indeed - that probably would require more detailed analysis.
> 
>>>>> In fact I would simply do sth like
>>>>>
>>>>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>>>>>   n_elem = size * 8 / BITS_PER_UNIT;
>>>>>   array_type = build_array_type_nelts (elem_type, n_elem);
>>>>>   var = create_tmp_var (array_type, NULL);
>>>>>   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
>>>>>
>>>>
>>>> I tried this code on the example, and it works, but the newly declared type has
>>>> an 8-bit alignment, while the vla base type has a 32 bit alignment.  This make
>>>> the memory access in the example potentially unaligned, which prohibits an
>>>> ivopts optimization, so the resulting text size is 68 instead of the 64 achieved
>>>> with my current patch.
>>>
>>> Ok, so then set DECL_ALIGN of the variable to something reasonable
>>> like MIN (size * 8, GET_MODE_PRECISION (word_mode)).  Basically the
>>> alignment that the targets alloca function would guarantee.
>>>
>>
>> I tried that, but that doesn't help. It's the alignment of the type that
>> matters, not of the decl.
> 
> It shouldn't.  All accesses are performed with the original types and
> alignment comes from that (plus the underlying decl).
> 

I managed to get it all working by using build_aligned_type rather that DECL_ALIGN.

>> So should we try to find the base type of the vla, and use that, or use the
>> nonstandard char type?
> 
> I don't think we can reliably find the base type of the vla - well,
> in practice we may because we control how we lower VLAs during
> gimplification, but nothing in the IL constraints say that the
> resulting pointer type should be special.
> 
> Using a char[] decl shouldn't be a problem IMHO.
> 
>>>>> And obviously you lose the optimization we arrange with inserting
>>>>> __builtin_stack_save/restore pairs that way - stack space will no
>>>>> longer be shared for subsequent VLAs.  Which means that you'd
>>>>> better limit the size you allow this promotion.
>>>>>
>>>>
>>>> Right, I could introduce a parameter for this.
>>>
>>> I would think you could use PARAM_LARGE_STACK_FRAME for now and say,
>>> allow a size of PARAM_LARGE_STACK_FRAME / 10?
>>>
>>
>> That unfortunately is too small for the example from bug report. The default
>> value of the param is 250, so that would be a threshold of 25, and the alloca
>> size of the example is 40.  Perhaps we can try a threshold of
>> PARAM_LARGE_STACK_FRAME - estimated_stack_size or some such?
> 
> Hm.  estimated_stack_size is not O(1), so no.  I think we need to
> find a sensible way of allowing stack sharing.  Eventually Michas
> patch for introducing points-of-death would help here, if we'd
> go for folding this during stack-save/restore optimization.
> 

I changed the heuristics to this:

+  /* Heuristic: don't fold large vlas.  */
+  threshold = (unsigned HOST_WIDE_INT)PARAM_VALUE (PARAM_LARGE_STACK_FRAME);
+  /* In case a vla is declared at function scope, it has the same lifetime as a
+     declared array, so we allow a larger size.  */
+  block = gimple_block (stmt);
+  if (!(cfun->after_inlining
+        && TREE_CODE (BLOCK_SUPERCONTEXT (block)) == FUNCTION_DECL))
+    threshold /= 10;
+  if (size > threshold)
+    return NULL_TREE;

The heuristics distinguishes between before and after inlining.

After inlining, vla's declared at function scope have the same lifetimes as
declared arrays, and don't share their space. There should be no negative
effects from folding an alloca in this case, but for safety we set a threshold
of PARAM_LARGE_STACK_FRAME.

Before inlining, such a vla might be inlined and share its space with another
vla, so we stick with the normal threshold before inlining.

However, using this heuristic we still don't generate optimal code.

During the first pass_ccp, the folding is not done, because the size (40) is
larger than the threshold 25. The threshold is 25, because inlining is not yet done.

During pass_fold_builtins, the folding is done because it's after inlining, but
it's later than pass_iv_optimize, so that still doesn't yield the optimal size
of 64.

The folding is not done during any of the other invocations or pass_ccp, because
the argument has already become constant in the earlier invocation.

Using this change, I manage to trigger folding during the second invocation of
pass_ccp, before iv_optimize so we generate optimal code.

Index: gcc/tree-ssa-ccp.c
===================================================================
--- gcc/tree-ssa-ccp.c (revision 173734)
+++ gcc/tree-ssa-ccp.c (working copy)
@@ -1727,6 +1727,13 @@ ccp_fold_stmt (gimple_stmt_iterator *gsi
 	if (gimple_call_internal_p (stmt))
 	  return false;

+        /* The heuristic of fold_builtin_alloca differs before and after
+           inlining, so we don't require the arg to be changed into a constant
+           for folding, but just to be constant.  */
+        if (gimple_call_alloca_for_var_p (stmt)
+            && get_constant_value (gimple_call_arg (stmt, 0)))
+          return true;
+
 	/* Propagate into the call arguments.  Compared to replace_uses_in
 	   this can use the argument slot types for type verification
 	   instead of the current argument type.  We also can safely

But, to me it feels like a hack. Do you have any ideas how to do this better?

Attaching untested patch for reference (will test overnight).

Thanks,
- Tom

2011-07-28  Tom de Vries  <tom@codesourcery.com>

	PR middle-end/43513
	* gimple-fold.c (params.h): Include.
	(fold_builtin_alloca): New function.
	(gimple_fold_builtin): Use fold_builtin_alloca.
	* tree-ssa-ccp.c (ccp_fold_stmt): Force folding of vla-related alloca.
	* Makefile.in (gimple-fold.o): Add $(PARAMS_H) to rule.

[-- Attachment #2: 01_pr43513.4.patch --]
[-- Type: text/x-patch, Size: 4297 bytes --]

Index: gcc/gimple-fold.c
===================================================================
--- gcc/gimple-fold.c (revision 173734)
+++ gcc/gimple-fold.c (working copy)
@@ -31,6 +31,7 @@ along with GCC; see the file COPYING3.  
 #include "tree-ssa-propagate.h"
 #include "target.h"
 #include "gimple-fold.h"
+#include "params.h"
 
 /* Return true when DECL can be referenced from current unit.
    We can get declarations that are not possible to reference for
@@ -1170,6 +1171,54 @@ get_maxval_strlen (tree arg, tree *lengt
     }
 }
 
+/* Detects a vla-related alloca with a constant argument.  Declares fixed-size
+   array and return the address, if found, otherwise returns NULL_TREE.  */
+
+static tree
+fold_builtin_alloca (gimple stmt)
+{
+  unsigned HOST_WIDE_INT size, threshold, n_elem;
+  tree lhs, arg, block, var, elem_type, array_type;
+  unsigned int align;
+
+  /* Get lhs.  */
+  lhs = gimple_call_lhs (stmt);
+  if (lhs == NULL_TREE)
+    return NULL_TREE;
+
+  /* Only handle vla-related allocas for the moment.  We do not yet detect when
+     a source-level alloca can be safely folded.  */
+  if (!gimple_call_alloca_for_var_p (stmt))
+    return NULL_TREE;
+
+  /* Detect constant argument.  */
+  arg = gimple_call_arg (stmt, 0);
+  if (TREE_CODE (arg) != INTEGER_CST || !host_integerp (arg, 1))
+    return NULL_TREE;
+  size = TREE_INT_CST_LOW (arg);
+
+  /* Heuristic: don't fold large vlas.  */
+  threshold = (unsigned HOST_WIDE_INT)PARAM_VALUE (PARAM_LARGE_STACK_FRAME);
+  /* In case a vla is declared at function scope, it has the same lifetime as a
+     declared array, so we allow a larger size.  */
+  block = gimple_block (stmt);
+  if (!(cfun->after_inlining
+        && TREE_CODE (BLOCK_SUPERCONTEXT (block)) == FUNCTION_DECL))
+    threshold /= 10;
+  if (size > threshold)
+    return NULL_TREE;
+
+  /* Declare array.  */
+  elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
+  n_elem = size * 8 / BITS_PER_UNIT;
+  align = MIN (size * 8, GET_MODE_PRECISION (word_mode));
+  array_type = build_aligned_type (build_array_type_nelts (elem_type, n_elem),
+                                   align);
+  var = create_tmp_var (array_type, NULL);
+
+  /* Fold alloca to the address of the array.  */
+  return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
+}
 
 /* Fold builtin call in statement STMT.  Returns a simplified tree.
    We may return a non-constant expression, including another call
@@ -1246,6 +1295,8 @@ gimple_fold_builtin (gimple stmt)
       arg_idx = 1;
       type = 2;
       break;
+    case BUILT_IN_ALLOCA:
+      return fold_builtin_alloca (stmt);
     default:
       return NULL_TREE;
     }
Index: gcc/tree-ssa-ccp.c
===================================================================
--- gcc/tree-ssa-ccp.c (revision 173734)
+++ gcc/tree-ssa-ccp.c (working copy)
@@ -1727,6 +1727,13 @@ ccp_fold_stmt (gimple_stmt_iterator *gsi
 	if (gimple_call_internal_p (stmt))
 	  return false;
 
+        /* The heuristic of fold_builtin_alloca differs before and after
+           inlining, so we don't require the arg to be changed into a constant
+           for folding, but just to be constant.  */
+        if (gimple_call_alloca_for_var_p (stmt)
+            && get_constant_value (gimple_call_arg (stmt, 0)))
+          return true;
+
 	/* Propagate into the call arguments.  Compared to replace_uses_in
 	   this can use the argument slot types for type verification
 	   instead of the current argument type.  We also can safely
Index: gcc/Makefile.in
===================================================================
--- gcc/Makefile.in (revision 173734)
+++ gcc/Makefile.in (working copy)
@@ -2672,7 +2672,7 @@ gimple-iterator.o : gimple-iterator.c $(
 gimple-fold.o : gimple-fold.c $(TREE_FLOW_H) $(CONFIG_H) \
    $(SYSTEM_H) $(RTL_H) $(TREE_H) $(TM_P_H) $(EXPR_H) $(GGC_H) output.h \
    $(DIAGNOSTIC_H) $(FUNCTION_H) $(TIMEVAR_H) $(TM_H) coretypes.h \
-   $(TREE_DUMP_H) $(BASIC_BLOCK_H) $(TREE_PASS_H) langhooks.h \
+   $(TREE_DUMP_H) $(BASIC_BLOCK_H) $(TREE_PASS_H) langhooks.h $(PARAMS_H) \
    tree-ssa-propagate.h value-prof.h $(FLAGS_H) $(TARGET_H) gimple-fold.h
 gimple-low.o : gimple-low.c $(CONFIG_H) $(SYSTEM_H) $(TREE_H) \
    $(DIAGNOSTIC_H) $(GIMPLE_H) $(TREE_INLINE_H) langhooks.h \

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-28 18:09                 ` Tom de Vries
  2011-07-28 19:38                   ` Richard Guenther
@ 2011-07-30  9:01                   ` Tom de Vries
  2011-07-30  9:23                     ` Tom de Vries
                                       ` (5 more replies)
  1 sibling, 6 replies; 37+ messages in thread
From: Tom de Vries @ 2011-07-30  9:01 UTC (permalink / raw)
  To: Richard Guenther; +Cc: gcc-patches, Zdenek Dvorak

[-- Attachment #1: Type: text/plain, Size: 12042 bytes --]

Hi,

On 07/28/2011 08:20 PM, Tom de Vries wrote:
> On 07/28/2011 06:25 PM, Richard Guenther wrote:
>> On Thu, 28 Jul 2011, Tom de Vries wrote:
>>
>>> On 07/28/2011 12:22 PM, Richard Guenther wrote:
>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>
>>>>> On 07/27/2011 05:27 PM, Richard Guenther wrote:
>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>
>>>>>>> On 07/27/2011 02:12 PM, Richard Guenther wrote:
>>>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>>>
>>>>>>>>> On 07/27/2011 01:50 PM, Tom de Vries wrote:
>>>>>>>>>> Hi Richard,
>>>>>>>>>>
>>>>>>>>>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
>>>>>>>>>>
>>>>>>>>>> 01_pr43513.3.patch
>>>>>>>>>> 02_pr43513.3.test.patch
>>>>>>>>>> 03_pr43513.3.mudflap.patch
>>>>>>>>>>
>>>>>>>>>> The patch set has been bootstrapped and reg-tested on x86_64.
>>>>>>>>>>
>>>>>>>>>> I will sent out the patches individually.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> The patch replaces a vla __builtin_alloca that has a constant argument with an
>>>>>>>>> array declaration.
>>>>>>>>>
>>>>>>>>> OK for trunk?
>>>>>>>>
>>>>>>>> I don't think it is safe to try to get at the VLA type the way you do.
>>>>>>>
>>>>>>> I don't understand in what way it's not safe. Do you mean I don't manage to find
>>>>>>> the type always, or that I find the wrong type, or something else?
>>>>>>
>>>>>> I think you might get the wrong type,
>>>>>
>>>>> Ok, I'll review that code one more time.
>>>>>
>>>>>> you also do not transform code
>>>>>> like
>>>>>>
>>>>>>   int *p = alloca(4);
>>>>>>   *p = 3;
>>>>>>
>>>>>> as there is no array type involved here.
>>>>>>
>>>>>
>>>>> I was trying to stay away from non-vla allocas.  A source declared alloca has
>>>>> function livetime, so we could have a single alloca in a loop, called 10 times,
>>>>> with all 10 instances live at the same time. This patch does not detect such
>>>>> cases, and thus stays away from non-vla allocas. A vla decl does not have such
>>>>> problems, the lifetime ends when it goes out of scope.
>>>>
>>>> Yes indeed - that probably would require more detailed analysis.
>>>>
>>>>>>>> In fact I would simply do sth like
>>>>>>>>
>>>>>>>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>>>>>>>>   n_elem = size * 8 / BITS_PER_UNIT;
>>>>>>>>   array_type = build_array_type_nelts (elem_type, n_elem);
>>>>>>>>   var = create_tmp_var (array_type, NULL);
>>>>>>>>   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
>>>>>>>>
>>>>>>>
>>>>>>> I tried this code on the example, and it works, but the newly declared type has
>>>>>>> an 8-bit alignment, while the vla base type has a 32 bit alignment.  This make
>>>>>>> the memory access in the example potentially unaligned, which prohibits an
>>>>>>> ivopts optimization, so the resulting text size is 68 instead of the 64 achieved
>>>>>>> with my current patch.
>>>>>>
>>>>>> Ok, so then set DECL_ALIGN of the variable to something reasonable
>>>>>> like MIN (size * 8, GET_MODE_PRECISION (word_mode)).  Basically the
>>>>>> alignment that the targets alloca function would guarantee.
>>>>>>
>>>>>
>>>>> I tried that, but that doesn't help. It's the alignment of the type that
>>>>> matters, not of the decl.
>>>>
>>>> It shouldn't.  All accesses are performed with the original types and
>>>> alignment comes from that (plus the underlying decl).
>>>>
>>>
>>> I managed to get it all working by using build_aligned_type rather that DECL_ALIGN.
>>
>> That's really odd, DECL_ALIGN should just work - nothing refers to the
>> type of the decl in the IL.  Can you try also setting DECL_USER_ALIGN to 
>> 1 maybe?
>>
> 
> This doesn't work either.
> 
>   /* Declare array.  */
>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>   n_elem = size * 8 / BITS_PER_UNIT;
>   align = MIN (size * 8, GET_MODE_PRECISION (word_mode));
>   array_type = build_array_type_nelts (elem_type, n_elem);
>   var = create_tmp_var (array_type, NULL);
>   DECL_ALIGN (var) = align;
>   DECL_USER_ALIGN (var) = 1;
> 
> Maybe this clarifies it:
> 
> Breakpoint 1, may_be_unaligned_p (ref=0xf7d9d410, step=0xf7d3d578) at
> /home/vries/local/google/src/gcc-mainline/gcc/tree-ssa-loop-ivopts.c:1621
> (gdb) call debug_generic_expr (ref)
> MEM[(int[0:D.2579] *)&D.2595][0]
> (gdb) call debug_generic_expr (step)
> 4
> 
> 1627	  base = get_inner_reference (ref, &bitsize, &bitpos, &toffset, &mode,
> (gdb) call debug_generic_expr (base)
> D.2595
> 
> 1629	  base_type = TREE_TYPE (base);
> (gdb) call debug_generic_expr (base_type)
> <unnamed-unsigned:8>[40]
> 
> 1630	  base_align = TYPE_ALIGN (base_type);
> (gdb) p base_align
> $1 = 8
> 
> So the align is 8-bits, and we return true here:
> 
> (gdb) n
> 1632	  if (mode != BLKmode)
> (gdb) n
> 1634	      unsigned mode_align = GET_MODE_ALIGNMENT (mode);
> (gdb)
> 1636	      if (base_align < mode_align
> (gdb)
> 1639		return true;
> 
> 
> Here we can see that the base actually has the (user) align on it:
> 
> (gdb) call debug_tree (base)
>  <var_decl 0xf7e1b420 D.2595
>     type <array_type 0xf7e1b360
>         type <integer_type 0xf7e1b2a0 public unsigned QI
>             size <integer_cst 0xf7d3d604 constant 8>
>             unit size <integer_cst 0xf7d3d620 constant 1>
>             align 8 symtab 0 alias set -1 canonical type 0xf7e1b2a0 precision 8
>             min <integer_cst 0xf7dffaf0 0> max <integer_cst 0xf7dffb0c 255>
>             pointer_to_this <pointer_type 0xf7e1b3c0>>
>         BLK
>         size <integer_cst 0xf7d5d070 constant 320>
>         unit size <integer_cst 0xf7dde2a0 constant 40>
>         align 8 symtab 0 alias set -1 canonical type 0xf7e1b360
>         domain <integer_type 0xf7de12a0
>                 type <integer_type 0xf7d51000 unsigned int>
>             SI
>             size <integer_cst 0xf7d3d78c constant 32>
>             unit size <integer_cst 0xf7d3d578 constant 4>
>             align 32 symtab 0 alias set -1 canonical type 0xf7de12a0
>             precision 32 min <integer_cst 0xf7d3d594 0>
>             max <integer_cst 0xf7dde284 39>>
>         pointer_to_this <pointer_type 0xf7e1b480>>
>     addressable used ignored BLK file pr43513.c line 4 col 6
>     size <integer_cst 0xf7d5d070 320> unit size <integer_cst 0xf7dde2a0 40>
>     user align 32 context <function_decl 0xf7dfd480 foo3>>
> 
> 
>>>
>>>>> So should we try to find the base type of the vla, and use that, or use the
>>>>> nonstandard char type?
>>>>
>>>> I don't think we can reliably find the base type of the vla - well,
>>>> in practice we may because we control how we lower VLAs during
>>>> gimplification, but nothing in the IL constraints say that the
>>>> resulting pointer type should be special.
>>>>
>>>> Using a char[] decl shouldn't be a problem IMHO.
>>>>
>>>>>>>> And obviously you lose the optimization we arrange with inserting
>>>>>>>> __builtin_stack_save/restore pairs that way - stack space will no
>>>>>>>> longer be shared for subsequent VLAs.  Which means that you'd
>>>>>>>> better limit the size you allow this promotion.
>>>>>>>>
>>>>>>>
>>>>>>> Right, I could introduce a parameter for this.
>>>>>>
>>>>>> I would think you could use PARAM_LARGE_STACK_FRAME for now and say,
>>>>>> allow a size of PARAM_LARGE_STACK_FRAME / 10?
>>>>>>
>>>>>
>>>>> That unfortunately is too small for the example from bug report. The default
>>>>> value of the param is 250, so that would be a threshold of 25, and the alloca
>>>>> size of the example is 40.  Perhaps we can try a threshold of
>>>>> PARAM_LARGE_STACK_FRAME - estimated_stack_size or some such?
>>>>
>>>> Hm.  estimated_stack_size is not O(1), so no.  I think we need to
>>>> find a sensible way of allowing stack sharing.  Eventually Michas
>>>> patch for introducing points-of-death would help here, if we'd
>>>> go for folding this during stack-save/restore optimization.
>>>>
>>>
>>> I changed the heuristics to this:
>>>
>>> +  /* Heuristic: don't fold large vlas.  */
>>> +  threshold = (unsigned HOST_WIDE_INT)PARAM_VALUE (PARAM_LARGE_STACK_FRAME);
>>> +  /* In case a vla is declared at function scope, it has the same lifetime as a
>>> +     declared array, so we allow a larger size.  */
>>> +  block = gimple_block (stmt);
>>> +  if (!(cfun->after_inlining
>>> +        && TREE_CODE (BLOCK_SUPERCONTEXT (block)) == FUNCTION_DECL))
>>> +    threshold /= 10;
>>> +  if (size > threshold)
>>> +    return NULL_TREE;
>>>
>>> The heuristics distinguishes between before and after inlining.
>>>
>>> After inlining, vla's declared at function scope have the same lifetimes as
>>> declared arrays, and don't share their space. There should be no negative
>>> effects from folding an alloca in this case, but for safety we set a threshold
>>> of PARAM_LARGE_STACK_FRAME.
>>>
>>> Before inlining, such a vla might be inlined and share its space with another
>>> vla, so we stick with the normal threshold before inlining.
>>
>> That sounds reasonable, though the block check should probably use the
>> original VLA decl block, not that of the basic-block of the allocation,
>> but unfortunately we don't have access to that.  So I suppose using
>> the allocation basic-block BLOCK is good enough (still we don't
>> really care about BLOCK boundaries when doing CFG manipulations, so
>> the allocation bbs block may be not the outermost scope in more cases
>> than necessary).
>>
>>> However, using this heuristic we still don't generate optimal code.
>>>
>>> During the first pass_ccp, the folding is not done, because the size (40) is
>>> larger than the threshold 25. The threshold is 25, because inlining is not yet done.
>>>
>>> During pass_fold_builtins, the folding is done because it's after inlining, but
>>> it's later than pass_iv_optimize, so that still doesn't yield the optimal size
>>> of 64.
>>>
>>> The folding is not done during any of the other invocations or pass_ccp, because
>>> the argument has already become constant in the earlier invocation.
>>
>> Yeah, that's the issue with relying on folding to do this transformation.
>>
>>> Using this change, I manage to trigger folding during the second invocation of
>>> pass_ccp, before iv_optimize so we generate optimal code.
>>>
>>> Index: gcc/tree-ssa-ccp.c
>>> ===================================================================
>>> --- gcc/tree-ssa-ccp.c (revision 173734)
>>> +++ gcc/tree-ssa-ccp.c (working copy)
>>> @@ -1727,6 +1727,13 @@ ccp_fold_stmt (gimple_stmt_iterator *gsi
>>>  	if (gimple_call_internal_p (stmt))
>>>  	  return false;
>>>
>>> +        /* The heuristic of fold_builtin_alloca differs before and after
>>> +           inlining, so we don't require the arg to be changed into a constant
>>> +           for folding, but just to be constant.  */
>>> +        if (gimple_call_alloca_for_var_p (stmt)
>>> +            && get_constant_value (gimple_call_arg (stmt, 0)))
>>
>> Probably reverse the get_constant_value check and the transformation
> 
> Done.
> 
>> (gimple_call_alloca_for_var_p isn't a predicate as it has side-effects,
>> so its name should be changed).
>>
>>> +          return true;
>>> +
>>>  	/* Propagate into the call arguments.  Compared to replace_uses_in
>>>  	   this can use the argument slot types for type verification
>>>  	   instead of the current argument type.  We also can safely
>>>
>>> But, to me it feels like a hack. Do you have any ideas how to do this better?
>>
>> It's somewhat of a hack, but at least it is more of a defined place
>> for this transformation - which then suggests to remove it from
>> generic folding and only keep calling it from CCP this way.
>>
> 
> Done.
> 

This is an updated version of the patch. I have 2 new patches and an updated
testcase which I will sent out individually.

Patch set was bootstrapped and reg-tested on x86_64.

Ok for trunk?

Thanks,
- Tom

2011-07-30  Tom de Vries  <tom@codesourcery.com>

	PR middle-end/43513
	* Makefile.in (tree-ssa-ccp.o): Add $(PARAMS_H) to rule.
	* tree-ssa-ccp.c (params.h): Include.
	(fold_builtin_alloca_for_var): New function.
	(ccp_fold_stmt): Use fold_builtin_alloca_for_var.

[-- Attachment #2: pr43513.4.patch --]
[-- Type: text/x-patch, Size: 3832 bytes --]

Index: gcc/tree-ssa-ccp.c
===================================================================
--- gcc/tree-ssa-ccp.c (revision 173734)
+++ gcc/tree-ssa-ccp.c (working copy)
@@ -133,6 +133,7 @@ along with GCC; see the file COPYING3.  
 #include "diagnostic-core.h"
 #include "dbgcnt.h"
 #include "gimple-fold.h"
+#include "params.h"
 
 
 /* Possible lattice values.  */
@@ -1659,6 +1660,51 @@ evaluate_stmt (gimple stmt)
   return val;
 }
 
+/* Detects a vla-related alloca with a constant argument.  Declares fixed-size
+   array and return the address, if found, otherwise returns NULL_TREE.  */
+
+static tree
+fold_builtin_alloca_for_var (gimple stmt)
+{
+  unsigned HOST_WIDE_INT size, threshold, n_elem;
+  tree lhs, arg, block, var, elem_type, array_type;
+  unsigned int align;
+
+  /* Get lhs.  */
+  lhs = gimple_call_lhs (stmt);
+  if (lhs == NULL_TREE)
+    return NULL_TREE;
+
+  /* Detect constant argument.  */
+  arg = get_constant_value (gimple_call_arg (stmt, 0));
+  if (arg == NULL_TREE || TREE_CODE (arg) != INTEGER_CST
+      || !host_integerp (arg, 1))
+    return NULL_TREE;
+  size = TREE_INT_CST_LOW (arg);
+
+  /* Heuristic: don't fold large vlas.  */
+  threshold = (unsigned HOST_WIDE_INT)PARAM_VALUE (PARAM_LARGE_STACK_FRAME);
+  /* In case a vla is declared at function scope, it has the same lifetime as a
+     declared array, so we allow a larger size.  */
+  block = gimple_block (stmt);
+  if (!(cfun->after_inlining
+        && TREE_CODE (BLOCK_SUPERCONTEXT (block)) == FUNCTION_DECL))
+    threshold /= 10;
+  if (size > threshold)
+    return NULL_TREE;
+
+  /* Declare array.  */
+  elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
+  n_elem = size * 8 / BITS_PER_UNIT;
+  align = MIN (size * 8, GET_MODE_PRECISION (word_mode));
+  array_type = build_array_type_nelts (elem_type, n_elem);
+  var = create_tmp_var (array_type, NULL);
+  DECL_ALIGN (var) = align;
+
+  /* Fold alloca to the address of the array.  */
+  return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
+}
+
 /* Fold the stmt at *GSI with CCP specific information that propagating
    and regular folding does not catch.  */
 
@@ -1727,6 +1773,20 @@ ccp_fold_stmt (gimple_stmt_iterator *gsi
 	if (gimple_call_internal_p (stmt))
 	  return false;
 
+        /* The heuristic of fold_builtin_alloca_for_var differs before and after
+           inlining, so we don't require the arg to be changed into a constant
+           for folding, but just to be constant.  */
+        if (gimple_call_alloca_for_var_p (stmt))
+          {
+            tree new_rhs = fold_builtin_alloca_for_var (stmt);
+            bool res;
+            if (new_rhs == NULL_TREE)
+              return false;
+            res = update_call_from_tree (gsi, new_rhs);
+            gcc_assert (res);
+            return true;
+          }
+
 	/* Propagate into the call arguments.  Compared to replace_uses_in
 	   this can use the argument slot types for type verification
 	   instead of the current argument type.  We also can safely
Index: gcc/Makefile.in
===================================================================
--- gcc/Makefile.in (revision 173734)
+++ gcc/Makefile.in (working copy)
@@ -3157,7 +3157,7 @@ tree-call-cdce.o : tree-call-cdce.c $(CO
 tree-ssa-ccp.o : tree-ssa-ccp.c $(TREE_FLOW_H) $(CONFIG_H) \
    $(SYSTEM_H) $(TREE_H) $(TM_P_H) $(EXPR_H) output.h \
    $(DIAGNOSTIC_H) $(FUNCTION_H) $(TIMEVAR_H) $(TM_H) coretypes.h \
-   $(TREE_DUMP_H) $(BASIC_BLOCK_H) $(TREE_PASS_H) langhooks.h \
+   $(TREE_DUMP_H) $(BASIC_BLOCK_H) $(TREE_PASS_H) langhooks.h  $(PARAMS_H) \
    tree-ssa-propagate.h value-prof.h $(FLAGS_H) $(TARGET_H) $(DIAGNOSTIC_CORE_H) \
    $(DBGCNT_H) tree-pretty-print.h gimple-pretty-print.h gimple-fold.h
 tree-sra.o : tree-sra.c $(CONFIG_H) $(SYSTEM_H) coretypes.h alloc-pool.h \

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-30  9:01                   ` Tom de Vries
@ 2011-07-30  9:23                     ` Tom de Vries
  2011-07-30 14:22                       ` Richard Guenther
  2011-07-30  9:27                     ` Tom de Vries
                                       ` (4 subsequent siblings)
  5 siblings, 1 reply; 37+ messages in thread
From: Tom de Vries @ 2011-07-30  9:23 UTC (permalink / raw)
  To: Richard Guenther; +Cc: gcc-patches, Zdenek Dvorak

[-- Attachment #1: Type: text/plain, Size: 12161 bytes --]

On 07/30/2011 10:21 AM, Tom de Vries wrote:
> Hi,
> 
> On 07/28/2011 08:20 PM, Tom de Vries wrote:
>> On 07/28/2011 06:25 PM, Richard Guenther wrote:
>>> On Thu, 28 Jul 2011, Tom de Vries wrote:
>>>
>>>> On 07/28/2011 12:22 PM, Richard Guenther wrote:
>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>
>>>>>> On 07/27/2011 05:27 PM, Richard Guenther wrote:
>>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>>
>>>>>>>> On 07/27/2011 02:12 PM, Richard Guenther wrote:
>>>>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>>>>
>>>>>>>>>> On 07/27/2011 01:50 PM, Tom de Vries wrote:
>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>
>>>>>>>>>>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
>>>>>>>>>>>
>>>>>>>>>>> 01_pr43513.3.patch
>>>>>>>>>>> 02_pr43513.3.test.patch
>>>>>>>>>>> 03_pr43513.3.mudflap.patch
>>>>>>>>>>>
>>>>>>>>>>> The patch set has been bootstrapped and reg-tested on x86_64.
>>>>>>>>>>>
>>>>>>>>>>> I will sent out the patches individually.
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> The patch replaces a vla __builtin_alloca that has a constant argument with an
>>>>>>>>>> array declaration.
>>>>>>>>>>
>>>>>>>>>> OK for trunk?
>>>>>>>>>
>>>>>>>>> I don't think it is safe to try to get at the VLA type the way you do.
>>>>>>>>
>>>>>>>> I don't understand in what way it's not safe. Do you mean I don't manage to find
>>>>>>>> the type always, or that I find the wrong type, or something else?
>>>>>>>
>>>>>>> I think you might get the wrong type,
>>>>>>
>>>>>> Ok, I'll review that code one more time.
>>>>>>
>>>>>>> you also do not transform code
>>>>>>> like
>>>>>>>
>>>>>>>   int *p = alloca(4);
>>>>>>>   *p = 3;
>>>>>>>
>>>>>>> as there is no array type involved here.
>>>>>>>
>>>>>>
>>>>>> I was trying to stay away from non-vla allocas.  A source declared alloca has
>>>>>> function livetime, so we could have a single alloca in a loop, called 10 times,
>>>>>> with all 10 instances live at the same time. This patch does not detect such
>>>>>> cases, and thus stays away from non-vla allocas. A vla decl does not have such
>>>>>> problems, the lifetime ends when it goes out of scope.
>>>>>
>>>>> Yes indeed - that probably would require more detailed analysis.
>>>>>
>>>>>>>>> In fact I would simply do sth like
>>>>>>>>>
>>>>>>>>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>>>>>>>>>   n_elem = size * 8 / BITS_PER_UNIT;
>>>>>>>>>   array_type = build_array_type_nelts (elem_type, n_elem);
>>>>>>>>>   var = create_tmp_var (array_type, NULL);
>>>>>>>>>   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
>>>>>>>>>
>>>>>>>>
>>>>>>>> I tried this code on the example, and it works, but the newly declared type has
>>>>>>>> an 8-bit alignment, while the vla base type has a 32 bit alignment.  This make
>>>>>>>> the memory access in the example potentially unaligned, which prohibits an
>>>>>>>> ivopts optimization, so the resulting text size is 68 instead of the 64 achieved
>>>>>>>> with my current patch.
>>>>>>>
>>>>>>> Ok, so then set DECL_ALIGN of the variable to something reasonable
>>>>>>> like MIN (size * 8, GET_MODE_PRECISION (word_mode)).  Basically the
>>>>>>> alignment that the targets alloca function would guarantee.
>>>>>>>
>>>>>>
>>>>>> I tried that, but that doesn't help. It's the alignment of the type that
>>>>>> matters, not of the decl.
>>>>>
>>>>> It shouldn't.  All accesses are performed with the original types and
>>>>> alignment comes from that (plus the underlying decl).
>>>>>
>>>>
>>>> I managed to get it all working by using build_aligned_type rather that DECL_ALIGN.
>>>
>>> That's really odd, DECL_ALIGN should just work - nothing refers to the
>>> type of the decl in the IL.  Can you try also setting DECL_USER_ALIGN to 
>>> 1 maybe?
>>>
>>
>> This doesn't work either.
>>
>>   /* Declare array.  */
>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>>   n_elem = size * 8 / BITS_PER_UNIT;
>>   align = MIN (size * 8, GET_MODE_PRECISION (word_mode));
>>   array_type = build_array_type_nelts (elem_type, n_elem);
>>   var = create_tmp_var (array_type, NULL);
>>   DECL_ALIGN (var) = align;
>>   DECL_USER_ALIGN (var) = 1;
>>
>> Maybe this clarifies it:
>>
>> Breakpoint 1, may_be_unaligned_p (ref=0xf7d9d410, step=0xf7d3d578) at
>> /home/vries/local/google/src/gcc-mainline/gcc/tree-ssa-loop-ivopts.c:1621
>> (gdb) call debug_generic_expr (ref)
>> MEM[(int[0:D.2579] *)&D.2595][0]
>> (gdb) call debug_generic_expr (step)
>> 4
>>
>> 1627	  base = get_inner_reference (ref, &bitsize, &bitpos, &toffset, &mode,
>> (gdb) call debug_generic_expr (base)
>> D.2595
>>
>> 1629	  base_type = TREE_TYPE (base);
>> (gdb) call debug_generic_expr (base_type)
>> <unnamed-unsigned:8>[40]
>>
>> 1630	  base_align = TYPE_ALIGN (base_type);
>> (gdb) p base_align
>> $1 = 8
>>
>> So the align is 8-bits, and we return true here:
>>
>> (gdb) n
>> 1632	  if (mode != BLKmode)
>> (gdb) n
>> 1634	      unsigned mode_align = GET_MODE_ALIGNMENT (mode);
>> (gdb)
>> 1636	      if (base_align < mode_align
>> (gdb)
>> 1639		return true;
>>
>>
>> Here we can see that the base actually has the (user) align on it:
>>
>> (gdb) call debug_tree (base)
>>  <var_decl 0xf7e1b420 D.2595
>>     type <array_type 0xf7e1b360
>>         type <integer_type 0xf7e1b2a0 public unsigned QI
>>             size <integer_cst 0xf7d3d604 constant 8>
>>             unit size <integer_cst 0xf7d3d620 constant 1>
>>             align 8 symtab 0 alias set -1 canonical type 0xf7e1b2a0 precision 8
>>             min <integer_cst 0xf7dffaf0 0> max <integer_cst 0xf7dffb0c 255>
>>             pointer_to_this <pointer_type 0xf7e1b3c0>>
>>         BLK
>>         size <integer_cst 0xf7d5d070 constant 320>
>>         unit size <integer_cst 0xf7dde2a0 constant 40>
>>         align 8 symtab 0 alias set -1 canonical type 0xf7e1b360
>>         domain <integer_type 0xf7de12a0
>>                 type <integer_type 0xf7d51000 unsigned int>
>>             SI
>>             size <integer_cst 0xf7d3d78c constant 32>
>>             unit size <integer_cst 0xf7d3d578 constant 4>
>>             align 32 symtab 0 alias set -1 canonical type 0xf7de12a0
>>             precision 32 min <integer_cst 0xf7d3d594 0>
>>             max <integer_cst 0xf7dde284 39>>
>>         pointer_to_this <pointer_type 0xf7e1b480>>
>>     addressable used ignored BLK file pr43513.c line 4 col 6
>>     size <integer_cst 0xf7d5d070 320> unit size <integer_cst 0xf7dde2a0 40>
>>     user align 32 context <function_decl 0xf7dfd480 foo3>>
>>
>>
>>>>
>>>>>> So should we try to find the base type of the vla, and use that, or use the
>>>>>> nonstandard char type?
>>>>>
>>>>> I don't think we can reliably find the base type of the vla - well,
>>>>> in practice we may because we control how we lower VLAs during
>>>>> gimplification, but nothing in the IL constraints say that the
>>>>> resulting pointer type should be special.
>>>>>
>>>>> Using a char[] decl shouldn't be a problem IMHO.
>>>>>
>>>>>>>>> And obviously you lose the optimization we arrange with inserting
>>>>>>>>> __builtin_stack_save/restore pairs that way - stack space will no
>>>>>>>>> longer be shared for subsequent VLAs.  Which means that you'd
>>>>>>>>> better limit the size you allow this promotion.
>>>>>>>>>
>>>>>>>>
>>>>>>>> Right, I could introduce a parameter for this.
>>>>>>>
>>>>>>> I would think you could use PARAM_LARGE_STACK_FRAME for now and say,
>>>>>>> allow a size of PARAM_LARGE_STACK_FRAME / 10?
>>>>>>>
>>>>>>
>>>>>> That unfortunately is too small for the example from bug report. The default
>>>>>> value of the param is 250, so that would be a threshold of 25, and the alloca
>>>>>> size of the example is 40.  Perhaps we can try a threshold of
>>>>>> PARAM_LARGE_STACK_FRAME - estimated_stack_size or some such?
>>>>>
>>>>> Hm.  estimated_stack_size is not O(1), so no.  I think we need to
>>>>> find a sensible way of allowing stack sharing.  Eventually Michas
>>>>> patch for introducing points-of-death would help here, if we'd
>>>>> go for folding this during stack-save/restore optimization.
>>>>>
>>>>
>>>> I changed the heuristics to this:
>>>>
>>>> +  /* Heuristic: don't fold large vlas.  */
>>>> +  threshold = (unsigned HOST_WIDE_INT)PARAM_VALUE (PARAM_LARGE_STACK_FRAME);
>>>> +  /* In case a vla is declared at function scope, it has the same lifetime as a
>>>> +     declared array, so we allow a larger size.  */
>>>> +  block = gimple_block (stmt);
>>>> +  if (!(cfun->after_inlining
>>>> +        && TREE_CODE (BLOCK_SUPERCONTEXT (block)) == FUNCTION_DECL))
>>>> +    threshold /= 10;
>>>> +  if (size > threshold)
>>>> +    return NULL_TREE;
>>>>
>>>> The heuristics distinguishes between before and after inlining.
>>>>
>>>> After inlining, vla's declared at function scope have the same lifetimes as
>>>> declared arrays, and don't share their space. There should be no negative
>>>> effects from folding an alloca in this case, but for safety we set a threshold
>>>> of PARAM_LARGE_STACK_FRAME.
>>>>
>>>> Before inlining, such a vla might be inlined and share its space with another
>>>> vla, so we stick with the normal threshold before inlining.
>>>
>>> That sounds reasonable, though the block check should probably use the
>>> original VLA decl block, not that of the basic-block of the allocation,
>>> but unfortunately we don't have access to that.  So I suppose using
>>> the allocation basic-block BLOCK is good enough (still we don't
>>> really care about BLOCK boundaries when doing CFG manipulations, so
>>> the allocation bbs block may be not the outermost scope in more cases
>>> than necessary).
>>>
>>>> However, using this heuristic we still don't generate optimal code.
>>>>
>>>> During the first pass_ccp, the folding is not done, because the size (40) is
>>>> larger than the threshold 25. The threshold is 25, because inlining is not yet done.
>>>>
>>>> During pass_fold_builtins, the folding is done because it's after inlining, but
>>>> it's later than pass_iv_optimize, so that still doesn't yield the optimal size
>>>> of 64.
>>>>
>>>> The folding is not done during any of the other invocations or pass_ccp, because
>>>> the argument has already become constant in the earlier invocation.
>>>
>>> Yeah, that's the issue with relying on folding to do this transformation.
>>>
>>>> Using this change, I manage to trigger folding during the second invocation of
>>>> pass_ccp, before iv_optimize so we generate optimal code.
>>>>
>>>> Index: gcc/tree-ssa-ccp.c
>>>> ===================================================================
>>>> --- gcc/tree-ssa-ccp.c (revision 173734)
>>>> +++ gcc/tree-ssa-ccp.c (working copy)
>>>> @@ -1727,6 +1727,13 @@ ccp_fold_stmt (gimple_stmt_iterator *gsi
>>>>  	if (gimple_call_internal_p (stmt))
>>>>  	  return false;
>>>>
>>>> +        /* The heuristic of fold_builtin_alloca differs before and after
>>>> +           inlining, so we don't require the arg to be changed into a constant
>>>> +           for folding, but just to be constant.  */
>>>> +        if (gimple_call_alloca_for_var_p (stmt)
>>>> +            && get_constant_value (gimple_call_arg (stmt, 0)))
>>>
>>> Probably reverse the get_constant_value check and the transformation
>>
>> Done.
>>
>>> (gimple_call_alloca_for_var_p isn't a predicate as it has side-effects,
>>> so its name should be changed).
>>>
>>>> +          return true;
>>>> +
>>>>  	/* Propagate into the call arguments.  Compared to replace_uses_in
>>>>  	   this can use the argument slot types for type verification
>>>>  	   instead of the current argument type.  We also can safely
>>>>
>>>> But, to me it feels like a hack. Do you have any ideas how to do this better?
>>>
>>> It's somewhat of a hack, but at least it is more of a defined place
>>> for this transformation - which then suggests to remove it from
>>> generic folding and only keep calling it from CCP this way.
>>>
>>
>> Done.
>>
> 
> This is an updated version of the patch. I have 2 new patches and an updated
> testcase which I will sent out individually.
> 

2011-07-30  Tom de Vries  <tom@codesourcery.com>

	PR middle-end/43513
	* tree-ssa-loop-ivopts.c (may_be_unaligned_p): Use get_object_alignment.

[-- Attachment #2: pr43513.4.ivopts.patch --]
[-- Type: text/x-patch, Size: 759 bytes --]

Index: gcc/tree-ssa-loop-ivopts.c
===================================================================
--- gcc/tree-ssa-loop-ivopts.c (revision 173734)
+++ gcc/tree-ssa-loop-ivopts.c (working copy)
@@ -1608,7 +1608,6 @@ static bool
 may_be_unaligned_p (tree ref, tree step)
 {
   tree base;
-  tree base_type;
   HOST_WIDE_INT bitsize;
   HOST_WIDE_INT bitpos;
   tree toffset;
@@ -1626,8 +1625,7 @@ may_be_unaligned_p (tree ref, tree step)
      STRICT_ALIGNMENT is true.  */
   base = get_inner_reference (ref, &bitsize, &bitpos, &toffset, &mode,
 			      &unsignedp, &volatilep, true);
-  base_type = TREE_TYPE (base);
-  base_align = TYPE_ALIGN (base_type);
+  base_align = get_object_alignment (base, BIGGEST_ALIGNMENT);
 
   if (mode != BLKmode)
     {

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-30  9:01                   ` Tom de Vries
  2011-07-30  9:23                     ` Tom de Vries
@ 2011-07-30  9:27                     ` Tom de Vries
  2011-07-30 13:50                       ` Richard Guenther
  2011-08-07 12:00                     ` Tom de Vries
                                       ` (3 subsequent siblings)
  5 siblings, 1 reply; 37+ messages in thread
From: Tom de Vries @ 2011-07-30  9:27 UTC (permalink / raw)
  To: Richard Guenther; +Cc: gcc-patches, Zdenek Dvorak

[-- Attachment #1: Type: text/plain, Size: 12656 bytes --]

On 07/30/2011 10:21 AM, Tom de Vries wrote:
> Hi,
> 
> On 07/28/2011 08:20 PM, Tom de Vries wrote:
>> On 07/28/2011 06:25 PM, Richard Guenther wrote:
>>> On Thu, 28 Jul 2011, Tom de Vries wrote:
>>>
>>>> On 07/28/2011 12:22 PM, Richard Guenther wrote:
>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>
>>>>>> On 07/27/2011 05:27 PM, Richard Guenther wrote:
>>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>>
>>>>>>>> On 07/27/2011 02:12 PM, Richard Guenther wrote:
>>>>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>>>>
>>>>>>>>>> On 07/27/2011 01:50 PM, Tom de Vries wrote:
>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>
>>>>>>>>>>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
>>>>>>>>>>>
>>>>>>>>>>> 01_pr43513.3.patch
>>>>>>>>>>> 02_pr43513.3.test.patch
>>>>>>>>>>> 03_pr43513.3.mudflap.patch
>>>>>>>>>>>
>>>>>>>>>>> The patch set has been bootstrapped and reg-tested on x86_64.
>>>>>>>>>>>
>>>>>>>>>>> I will sent out the patches individually.
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> The patch replaces a vla __builtin_alloca that has a constant argument with an
>>>>>>>>>> array declaration.
>>>>>>>>>>
>>>>>>>>>> OK for trunk?
>>>>>>>>>
>>>>>>>>> I don't think it is safe to try to get at the VLA type the way you do.
>>>>>>>>
>>>>>>>> I don't understand in what way it's not safe. Do you mean I don't manage to find
>>>>>>>> the type always, or that I find the wrong type, or something else?
>>>>>>>
>>>>>>> I think you might get the wrong type,
>>>>>>
>>>>>> Ok, I'll review that code one more time.
>>>>>>
>>>>>>> you also do not transform code
>>>>>>> like
>>>>>>>
>>>>>>>   int *p = alloca(4);
>>>>>>>   *p = 3;
>>>>>>>
>>>>>>> as there is no array type involved here.
>>>>>>>
>>>>>>
>>>>>> I was trying to stay away from non-vla allocas.  A source declared alloca has
>>>>>> function livetime, so we could have a single alloca in a loop, called 10 times,
>>>>>> with all 10 instances live at the same time. This patch does not detect such
>>>>>> cases, and thus stays away from non-vla allocas. A vla decl does not have such
>>>>>> problems, the lifetime ends when it goes out of scope.
>>>>>
>>>>> Yes indeed - that probably would require more detailed analysis.
>>>>>
>>>>>>>>> In fact I would simply do sth like
>>>>>>>>>
>>>>>>>>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>>>>>>>>>   n_elem = size * 8 / BITS_PER_UNIT;
>>>>>>>>>   array_type = build_array_type_nelts (elem_type, n_elem);
>>>>>>>>>   var = create_tmp_var (array_type, NULL);
>>>>>>>>>   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
>>>>>>>>>
>>>>>>>>
>>>>>>>> I tried this code on the example, and it works, but the newly declared type has
>>>>>>>> an 8-bit alignment, while the vla base type has a 32 bit alignment.  This make
>>>>>>>> the memory access in the example potentially unaligned, which prohibits an
>>>>>>>> ivopts optimization, so the resulting text size is 68 instead of the 64 achieved
>>>>>>>> with my current patch.
>>>>>>>
>>>>>>> Ok, so then set DECL_ALIGN of the variable to something reasonable
>>>>>>> like MIN (size * 8, GET_MODE_PRECISION (word_mode)).  Basically the
>>>>>>> alignment that the targets alloca function would guarantee.
>>>>>>>
>>>>>>
>>>>>> I tried that, but that doesn't help. It's the alignment of the type that
>>>>>> matters, not of the decl.
>>>>>
>>>>> It shouldn't.  All accesses are performed with the original types and
>>>>> alignment comes from that (plus the underlying decl).
>>>>>
>>>>
>>>> I managed to get it all working by using build_aligned_type rather that DECL_ALIGN.
>>>
>>> That's really odd, DECL_ALIGN should just work - nothing refers to the
>>> type of the decl in the IL.  Can you try also setting DECL_USER_ALIGN to 
>>> 1 maybe?
>>>
>>
>> This doesn't work either.
>>
>>   /* Declare array.  */
>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>>   n_elem = size * 8 / BITS_PER_UNIT;
>>   align = MIN (size * 8, GET_MODE_PRECISION (word_mode));
>>   array_type = build_array_type_nelts (elem_type, n_elem);
>>   var = create_tmp_var (array_type, NULL);
>>   DECL_ALIGN (var) = align;
>>   DECL_USER_ALIGN (var) = 1;
>>
>> Maybe this clarifies it:
>>
>> Breakpoint 1, may_be_unaligned_p (ref=0xf7d9d410, step=0xf7d3d578) at
>> /home/vries/local/google/src/gcc-mainline/gcc/tree-ssa-loop-ivopts.c:1621
>> (gdb) call debug_generic_expr (ref)
>> MEM[(int[0:D.2579] *)&D.2595][0]
>> (gdb) call debug_generic_expr (step)
>> 4
>>
>> 1627	  base = get_inner_reference (ref, &bitsize, &bitpos, &toffset, &mode,
>> (gdb) call debug_generic_expr (base)
>> D.2595
>>
>> 1629	  base_type = TREE_TYPE (base);
>> (gdb) call debug_generic_expr (base_type)
>> <unnamed-unsigned:8>[40]
>>
>> 1630	  base_align = TYPE_ALIGN (base_type);
>> (gdb) p base_align
>> $1 = 8
>>
>> So the align is 8-bits, and we return true here:
>>
>> (gdb) n
>> 1632	  if (mode != BLKmode)
>> (gdb) n
>> 1634	      unsigned mode_align = GET_MODE_ALIGNMENT (mode);
>> (gdb)
>> 1636	      if (base_align < mode_align
>> (gdb)
>> 1639		return true;
>>
>>
>> Here we can see that the base actually has the (user) align on it:
>>
>> (gdb) call debug_tree (base)
>>  <var_decl 0xf7e1b420 D.2595
>>     type <array_type 0xf7e1b360
>>         type <integer_type 0xf7e1b2a0 public unsigned QI
>>             size <integer_cst 0xf7d3d604 constant 8>
>>             unit size <integer_cst 0xf7d3d620 constant 1>
>>             align 8 symtab 0 alias set -1 canonical type 0xf7e1b2a0 precision 8
>>             min <integer_cst 0xf7dffaf0 0> max <integer_cst 0xf7dffb0c 255>
>>             pointer_to_this <pointer_type 0xf7e1b3c0>>
>>         BLK
>>         size <integer_cst 0xf7d5d070 constant 320>
>>         unit size <integer_cst 0xf7dde2a0 constant 40>
>>         align 8 symtab 0 alias set -1 canonical type 0xf7e1b360
>>         domain <integer_type 0xf7de12a0
>>                 type <integer_type 0xf7d51000 unsigned int>
>>             SI
>>             size <integer_cst 0xf7d3d78c constant 32>
>>             unit size <integer_cst 0xf7d3d578 constant 4>
>>             align 32 symtab 0 alias set -1 canonical type 0xf7de12a0
>>             precision 32 min <integer_cst 0xf7d3d594 0>
>>             max <integer_cst 0xf7dde284 39>>
>>         pointer_to_this <pointer_type 0xf7e1b480>>
>>     addressable used ignored BLK file pr43513.c line 4 col 6
>>     size <integer_cst 0xf7d5d070 320> unit size <integer_cst 0xf7dde2a0 40>
>>     user align 32 context <function_decl 0xf7dfd480 foo3>>
>>
>>
>>>>
>>>>>> So should we try to find the base type of the vla, and use that, or use the
>>>>>> nonstandard char type?
>>>>>
>>>>> I don't think we can reliably find the base type of the vla - well,
>>>>> in practice we may because we control how we lower VLAs during
>>>>> gimplification, but nothing in the IL constraints say that the
>>>>> resulting pointer type should be special.
>>>>>
>>>>> Using a char[] decl shouldn't be a problem IMHO.
>>>>>
>>>>>>>>> And obviously you lose the optimization we arrange with inserting
>>>>>>>>> __builtin_stack_save/restore pairs that way - stack space will no
>>>>>>>>> longer be shared for subsequent VLAs.  Which means that you'd
>>>>>>>>> better limit the size you allow this promotion.
>>>>>>>>>
>>>>>>>>
>>>>>>>> Right, I could introduce a parameter for this.
>>>>>>>
>>>>>>> I would think you could use PARAM_LARGE_STACK_FRAME for now and say,
>>>>>>> allow a size of PARAM_LARGE_STACK_FRAME / 10?
>>>>>>>
>>>>>>
>>>>>> That unfortunately is too small for the example from bug report. The default
>>>>>> value of the param is 250, so that would be a threshold of 25, and the alloca
>>>>>> size of the example is 40.  Perhaps we can try a threshold of
>>>>>> PARAM_LARGE_STACK_FRAME - estimated_stack_size or some such?
>>>>>
>>>>> Hm.  estimated_stack_size is not O(1), so no.  I think we need to
>>>>> find a sensible way of allowing stack sharing.  Eventually Michas
>>>>> patch for introducing points-of-death would help here, if we'd
>>>>> go for folding this during stack-save/restore optimization.
>>>>>
>>>>
>>>> I changed the heuristics to this:
>>>>
>>>> +  /* Heuristic: don't fold large vlas.  */
>>>> +  threshold = (unsigned HOST_WIDE_INT)PARAM_VALUE (PARAM_LARGE_STACK_FRAME);
>>>> +  /* In case a vla is declared at function scope, it has the same lifetime as a
>>>> +     declared array, so we allow a larger size.  */
>>>> +  block = gimple_block (stmt);
>>>> +  if (!(cfun->after_inlining
>>>> +        && TREE_CODE (BLOCK_SUPERCONTEXT (block)) == FUNCTION_DECL))
>>>> +    threshold /= 10;
>>>> +  if (size > threshold)
>>>> +    return NULL_TREE;
>>>>
>>>> The heuristics distinguishes between before and after inlining.
>>>>
>>>> After inlining, vla's declared at function scope have the same lifetimes as
>>>> declared arrays, and don't share their space. There should be no negative
>>>> effects from folding an alloca in this case, but for safety we set a threshold
>>>> of PARAM_LARGE_STACK_FRAME.
>>>>
>>>> Before inlining, such a vla might be inlined and share its space with another
>>>> vla, so we stick with the normal threshold before inlining.
>>>
>>> That sounds reasonable, though the block check should probably use the
>>> original VLA decl block, not that of the basic-block of the allocation,
>>> but unfortunately we don't have access to that.  So I suppose using
>>> the allocation basic-block BLOCK is good enough (still we don't
>>> really care about BLOCK boundaries when doing CFG manipulations, so
>>> the allocation bbs block may be not the outermost scope in more cases
>>> than necessary).
>>>
>>>> However, using this heuristic we still don't generate optimal code.
>>>>
>>>> During the first pass_ccp, the folding is not done, because the size (40) is
>>>> larger than the threshold 25. The threshold is 25, because inlining is not yet done.
>>>>
>>>> During pass_fold_builtins, the folding is done because it's after inlining, but
>>>> it's later than pass_iv_optimize, so that still doesn't yield the optimal size
>>>> of 64.
>>>>
>>>> The folding is not done during any of the other invocations or pass_ccp, because
>>>> the argument has already become constant in the earlier invocation.
>>>
>>> Yeah, that's the issue with relying on folding to do this transformation.
>>>
>>>> Using this change, I manage to trigger folding during the second invocation of
>>>> pass_ccp, before iv_optimize so we generate optimal code.
>>>>
>>>> Index: gcc/tree-ssa-ccp.c
>>>> ===================================================================
>>>> --- gcc/tree-ssa-ccp.c (revision 173734)
>>>> +++ gcc/tree-ssa-ccp.c (working copy)
>>>> @@ -1727,6 +1727,13 @@ ccp_fold_stmt (gimple_stmt_iterator *gsi
>>>>  	if (gimple_call_internal_p (stmt))
>>>>  	  return false;
>>>>
>>>> +        /* The heuristic of fold_builtin_alloca differs before and after
>>>> +           inlining, so we don't require the arg to be changed into a constant
>>>> +           for folding, but just to be constant.  */
>>>> +        if (gimple_call_alloca_for_var_p (stmt)
>>>> +            && get_constant_value (gimple_call_arg (stmt, 0)))
>>>
>>> Probably reverse the get_constant_value check and the transformation
>>
>> Done.
>>
>>> (gimple_call_alloca_for_var_p isn't a predicate as it has side-effects,
>>> so its name should be changed).
>>>
>>>> +          return true;
>>>> +
>>>>  	/* Propagate into the call arguments.  Compared to replace_uses_in
>>>>  	   this can use the argument slot types for type verification
>>>>  	   instead of the current argument type.  We also can safely
>>>>
>>>> But, to me it feels like a hack. Do you have any ideas how to do this better?
>>>
>>> It's somewhat of a hack, but at least it is more of a defined place
>>> for this transformation - which then suggests to remove it from
>>> generic folding and only keep calling it from CCP this way.
>>>
>>
>> Done.
>>
> 
> This is an updated version of the patch. I have 2 new patches and an updated
> testcase which I will sent out individually.
> 

Previously stores to vla's were marked as alive in dce because
is_hidden_global_store triggered.  With them converted to stores to a normal
array, that doesn't happen anymore.
The stores where then sometimes removed, while there was an aliasing load, a
function argument. But since ref_may_be_aliased was called with WITH_SIZE_EXPR
as toplevel expr, ref_may_be_aliased returned false for the aliasing load.
This patch fixes this.

OK for trunk?

Thanks,
- Tom

2011-07-30  Tom de Vries  <tom@codesourcery.com>

	PR middle-end/43513
	* tree-ssa-dce.c (ref_may_be_aliased): Add assert.
	(propagate_necessity): Handle WITH_SIZE_EXPR call arg.

[-- Attachment #2: pr43513.4.dce.patch --]
[-- Type: text/x-patch, Size: 773 bytes --]

Index: gcc/tree-ssa-dce.c
===================================================================
--- gcc/tree-ssa-dce.c (revision 173734)
+++ gcc/tree-ssa-dce.c (working copy)
@@ -489,6 +489,7 @@ find_obviously_necessary_stmts (struct e
 static bool
 ref_may_be_aliased (tree ref)
 {
+  gcc_assert (TREE_CODE (ref) != WITH_SIZE_EXPR);
   while (handled_component_p (ref))
     ref = TREE_OPERAND (ref, 0);
   if (TREE_CODE (ref) == MEM_REF
@@ -839,6 +840,8 @@ propagate_necessity (struct edge_list *e
 		  if (TREE_CODE (arg) == SSA_NAME
 		      || is_gimple_min_invariant (arg))
 		    continue;
+		  if (TREE_CODE (arg) == WITH_SIZE_EXPR)
+		    arg = TREE_OPERAND (arg, 0);
 		  if (!ref_may_be_aliased (arg))
 		    mark_aliased_reaching_defs_necessary (stmt, arg);
 		}

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  2/3] Replace vla with array - Test case.
  2011-07-27 11:12   ` Tom de Vries
@ 2011-07-30 10:39     ` Tom de Vries
  0 siblings, 0 replies; 37+ messages in thread
From: Tom de Vries @ 2011-07-30 10:39 UTC (permalink / raw)
  To: rguenther; +Cc: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 767 bytes --]

On 07/27/2011 01:55 PM, Tom de Vries wrote:
> On 07/27/2011 01:54 PM, Tom de Vries wrote:
>> On 07/27/2011 01:50 PM, Tom de Vries wrote:
>>> Hi Richard,
>>>
>>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
>>>
>>> 01_pr43513.3.patch
>>> 02_pr43513.3.test.patch
>>> 03_pr43513.3.mudflap.patch
>>>
>>> The patch set has been bootstrapped and reg-tested on x86_64.
>>>
>>> I will sent out the patches individually.
>>>
> 
> Sorry, with patch this time.
> 
>>
>> This patch adds the testcase from the bug report, modified not to
>> need includes.
>>

Updated test case, tranformation should happen during ccp2.

OK for trunk?

Thanks,
- Tom

2011-07-30  Tom de Vries  <tom@codesourcery.com>

	PR middle-end/43513
	* gcc.dg/pr43513.c: New test.

[-- Attachment #2: pr43513.4.test.patch --]
[-- Type: text/x-patch, Size: 626 bytes --]

Index: gcc/testsuite/gcc.dg/pr43513.c
===================================================================
--- gcc/testsuite/gcc.dg/pr43513.c	(revision 0)
+++ gcc/testsuite/gcc.dg/pr43513.c	(revision 0)
@@ -0,0 +1,19 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-ccp2" } */
+
+void bar (int *);
+void foo (char *, int);
+
+void
+foo3 ()
+{
+  const int kIterations = 10;
+  int results[kIterations];
+  int i;
+  bar (results);
+  for (i = 0; i < kIterations; i++)
+    foo ("%d ", results[i]);
+}
+
+/* { dg-final { scan-tree-dump-times "alloca" 0 "ccp2"} } */
+/* { dg-final { cleanup-tree-dump "ccp2" } } */

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513, 1/3] Replace vla with array - Implementation.
  2011-07-30  9:27                     ` Tom de Vries
@ 2011-07-30 13:50                       ` Richard Guenther
  0 siblings, 0 replies; 37+ messages in thread
From: Richard Guenther @ 2011-07-30 13:50 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Richard Guenther, gcc-patches, Zdenek Dvorak

On Sat, Jul 30, 2011 at 9:34 AM, Tom de Vries <vries@codesourcery.com> wrote:
> On 07/30/2011 10:21 AM, Tom de Vries wrote:
>> Hi,
>>
>> On 07/28/2011 08:20 PM, Tom de Vries wrote:
>>> On 07/28/2011 06:25 PM, Richard Guenther wrote:
>>>> On Thu, 28 Jul 2011, Tom de Vries wrote:
>>>>
>>>>> On 07/28/2011 12:22 PM, Richard Guenther wrote:
>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>
>>>>>>> On 07/27/2011 05:27 PM, Richard Guenther wrote:
>>>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>>>
>>>>>>>>> On 07/27/2011 02:12 PM, Richard Guenther wrote:
>>>>>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>>>>>
>>>>>>>>>>> On 07/27/2011 01:50 PM, Tom de Vries wrote:
>>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>>
>>>>>>>>>>>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
>>>>>>>>>>>>
>>>>>>>>>>>> 01_pr43513.3.patch
>>>>>>>>>>>> 02_pr43513.3.test.patch
>>>>>>>>>>>> 03_pr43513.3.mudflap.patch
>>>>>>>>>>>>
>>>>>>>>>>>> The patch set has been bootstrapped and reg-tested on x86_64.
>>>>>>>>>>>>
>>>>>>>>>>>> I will sent out the patches individually.
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> The patch replaces a vla __builtin_alloca that has a constant argument with an
>>>>>>>>>>> array declaration.
>>>>>>>>>>>
>>>>>>>>>>> OK for trunk?
>>>>>>>>>>
>>>>>>>>>> I don't think it is safe to try to get at the VLA type the way you do.
>>>>>>>>>
>>>>>>>>> I don't understand in what way it's not safe. Do you mean I don't manage to find
>>>>>>>>> the type always, or that I find the wrong type, or something else?
>>>>>>>>
>>>>>>>> I think you might get the wrong type,
>>>>>>>
>>>>>>> Ok, I'll review that code one more time.
>>>>>>>
>>>>>>>> you also do not transform code
>>>>>>>> like
>>>>>>>>
>>>>>>>>   int *p = alloca(4);
>>>>>>>>   *p = 3;
>>>>>>>>
>>>>>>>> as there is no array type involved here.
>>>>>>>>
>>>>>>>
>>>>>>> I was trying to stay away from non-vla allocas.  A source declared alloca has
>>>>>>> function livetime, so we could have a single alloca in a loop, called 10 times,
>>>>>>> with all 10 instances live at the same time. This patch does not detect such
>>>>>>> cases, and thus stays away from non-vla allocas. A vla decl does not have such
>>>>>>> problems, the lifetime ends when it goes out of scope.
>>>>>>
>>>>>> Yes indeed - that probably would require more detailed analysis.
>>>>>>
>>>>>>>>>> In fact I would simply do sth like
>>>>>>>>>>
>>>>>>>>>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>>>>>>>>>>   n_elem = size * 8 / BITS_PER_UNIT;
>>>>>>>>>>   array_type = build_array_type_nelts (elem_type, n_elem);
>>>>>>>>>>   var = create_tmp_var (array_type, NULL);
>>>>>>>>>>   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> I tried this code on the example, and it works, but the newly declared type has
>>>>>>>>> an 8-bit alignment, while the vla base type has a 32 bit alignment.  This make
>>>>>>>>> the memory access in the example potentially unaligned, which prohibits an
>>>>>>>>> ivopts optimization, so the resulting text size is 68 instead of the 64 achieved
>>>>>>>>> with my current patch.
>>>>>>>>
>>>>>>>> Ok, so then set DECL_ALIGN of the variable to something reasonable
>>>>>>>> like MIN (size * 8, GET_MODE_PRECISION (word_mode)).  Basically the
>>>>>>>> alignment that the targets alloca function would guarantee.
>>>>>>>>
>>>>>>>
>>>>>>> I tried that, but that doesn't help. It's the alignment of the type that
>>>>>>> matters, not of the decl.
>>>>>>
>>>>>> It shouldn't.  All accesses are performed with the original types and
>>>>>> alignment comes from that (plus the underlying decl).
>>>>>>
>>>>>
>>>>> I managed to get it all working by using build_aligned_type rather that DECL_ALIGN.
>>>>
>>>> That's really odd, DECL_ALIGN should just work - nothing refers to the
>>>> type of the decl in the IL.  Can you try also setting DECL_USER_ALIGN to
>>>> 1 maybe?
>>>>
>>>
>>> This doesn't work either.
>>>
>>>   /* Declare array.  */
>>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>>>   n_elem = size * 8 / BITS_PER_UNIT;
>>>   align = MIN (size * 8, GET_MODE_PRECISION (word_mode));
>>>   array_type = build_array_type_nelts (elem_type, n_elem);
>>>   var = create_tmp_var (array_type, NULL);
>>>   DECL_ALIGN (var) = align;
>>>   DECL_USER_ALIGN (var) = 1;
>>>
>>> Maybe this clarifies it:
>>>
>>> Breakpoint 1, may_be_unaligned_p (ref=0xf7d9d410, step=0xf7d3d578) at
>>> /home/vries/local/google/src/gcc-mainline/gcc/tree-ssa-loop-ivopts.c:1621
>>> (gdb) call debug_generic_expr (ref)
>>> MEM[(int[0:D.2579] *)&D.2595][0]
>>> (gdb) call debug_generic_expr (step)
>>> 4
>>>
>>> 1627   base = get_inner_reference (ref, &bitsize, &bitpos, &toffset, &mode,
>>> (gdb) call debug_generic_expr (base)
>>> D.2595
>>>
>>> 1629   base_type = TREE_TYPE (base);
>>> (gdb) call debug_generic_expr (base_type)
>>> <unnamed-unsigned:8>[40]
>>>
>>> 1630   base_align = TYPE_ALIGN (base_type);
>>> (gdb) p base_align
>>> $1 = 8
>>>
>>> So the align is 8-bits, and we return true here:
>>>
>>> (gdb) n
>>> 1632   if (mode != BLKmode)
>>> (gdb) n
>>> 1634       unsigned mode_align = GET_MODE_ALIGNMENT (mode);
>>> (gdb)
>>> 1636       if (base_align < mode_align
>>> (gdb)
>>> 1639         return true;
>>>
>>>
>>> Here we can see that the base actually has the (user) align on it:
>>>
>>> (gdb) call debug_tree (base)
>>>  <var_decl 0xf7e1b420 D.2595
>>>     type <array_type 0xf7e1b360
>>>         type <integer_type 0xf7e1b2a0 public unsigned QI
>>>             size <integer_cst 0xf7d3d604 constant 8>
>>>             unit size <integer_cst 0xf7d3d620 constant 1>
>>>             align 8 symtab 0 alias set -1 canonical type 0xf7e1b2a0 precision 8
>>>             min <integer_cst 0xf7dffaf0 0> max <integer_cst 0xf7dffb0c 255>
>>>             pointer_to_this <pointer_type 0xf7e1b3c0>>
>>>         BLK
>>>         size <integer_cst 0xf7d5d070 constant 320>
>>>         unit size <integer_cst 0xf7dde2a0 constant 40>
>>>         align 8 symtab 0 alias set -1 canonical type 0xf7e1b360
>>>         domain <integer_type 0xf7de12a0
>>>                 type <integer_type 0xf7d51000 unsigned int>
>>>             SI
>>>             size <integer_cst 0xf7d3d78c constant 32>
>>>             unit size <integer_cst 0xf7d3d578 constant 4>
>>>             align 32 symtab 0 alias set -1 canonical type 0xf7de12a0
>>>             precision 32 min <integer_cst 0xf7d3d594 0>
>>>             max <integer_cst 0xf7dde284 39>>
>>>         pointer_to_this <pointer_type 0xf7e1b480>>
>>>     addressable used ignored BLK file pr43513.c line 4 col 6
>>>     size <integer_cst 0xf7d5d070 320> unit size <integer_cst 0xf7dde2a0 40>
>>>     user align 32 context <function_decl 0xf7dfd480 foo3>>
>>>
>>>
>>>>>
>>>>>>> So should we try to find the base type of the vla, and use that, or use the
>>>>>>> nonstandard char type?
>>>>>>
>>>>>> I don't think we can reliably find the base type of the vla - well,
>>>>>> in practice we may because we control how we lower VLAs during
>>>>>> gimplification, but nothing in the IL constraints say that the
>>>>>> resulting pointer type should be special.
>>>>>>
>>>>>> Using a char[] decl shouldn't be a problem IMHO.
>>>>>>
>>>>>>>>>> And obviously you lose the optimization we arrange with inserting
>>>>>>>>>> __builtin_stack_save/restore pairs that way - stack space will no
>>>>>>>>>> longer be shared for subsequent VLAs.  Which means that you'd
>>>>>>>>>> better limit the size you allow this promotion.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Right, I could introduce a parameter for this.
>>>>>>>>
>>>>>>>> I would think you could use PARAM_LARGE_STACK_FRAME for now and say,
>>>>>>>> allow a size of PARAM_LARGE_STACK_FRAME / 10?
>>>>>>>>
>>>>>>>
>>>>>>> That unfortunately is too small for the example from bug report. The default
>>>>>>> value of the param is 250, so that would be a threshold of 25, and the alloca
>>>>>>> size of the example is 40.  Perhaps we can try a threshold of
>>>>>>> PARAM_LARGE_STACK_FRAME - estimated_stack_size or some such?
>>>>>>
>>>>>> Hm.  estimated_stack_size is not O(1), so no.  I think we need to
>>>>>> find a sensible way of allowing stack sharing.  Eventually Michas
>>>>>> patch for introducing points-of-death would help here, if we'd
>>>>>> go for folding this during stack-save/restore optimization.
>>>>>>
>>>>>
>>>>> I changed the heuristics to this:
>>>>>
>>>>> +  /* Heuristic: don't fold large vlas.  */
>>>>> +  threshold = (unsigned HOST_WIDE_INT)PARAM_VALUE (PARAM_LARGE_STACK_FRAME);
>>>>> +  /* In case a vla is declared at function scope, it has the same lifetime as a
>>>>> +     declared array, so we allow a larger size.  */
>>>>> +  block = gimple_block (stmt);
>>>>> +  if (!(cfun->after_inlining
>>>>> +        && TREE_CODE (BLOCK_SUPERCONTEXT (block)) == FUNCTION_DECL))
>>>>> +    threshold /= 10;
>>>>> +  if (size > threshold)
>>>>> +    return NULL_TREE;
>>>>>
>>>>> The heuristics distinguishes between before and after inlining.
>>>>>
>>>>> After inlining, vla's declared at function scope have the same lifetimes as
>>>>> declared arrays, and don't share their space. There should be no negative
>>>>> effects from folding an alloca in this case, but for safety we set a threshold
>>>>> of PARAM_LARGE_STACK_FRAME.
>>>>>
>>>>> Before inlining, such a vla might be inlined and share its space with another
>>>>> vla, so we stick with the normal threshold before inlining.
>>>>
>>>> That sounds reasonable, though the block check should probably use the
>>>> original VLA decl block, not that of the basic-block of the allocation,
>>>> but unfortunately we don't have access to that.  So I suppose using
>>>> the allocation basic-block BLOCK is good enough (still we don't
>>>> really care about BLOCK boundaries when doing CFG manipulations, so
>>>> the allocation bbs block may be not the outermost scope in more cases
>>>> than necessary).
>>>>
>>>>> However, using this heuristic we still don't generate optimal code.
>>>>>
>>>>> During the first pass_ccp, the folding is not done, because the size (40) is
>>>>> larger than the threshold 25. The threshold is 25, because inlining is not yet done.
>>>>>
>>>>> During pass_fold_builtins, the folding is done because it's after inlining, but
>>>>> it's later than pass_iv_optimize, so that still doesn't yield the optimal size
>>>>> of 64.
>>>>>
>>>>> The folding is not done during any of the other invocations or pass_ccp, because
>>>>> the argument has already become constant in the earlier invocation.
>>>>
>>>> Yeah, that's the issue with relying on folding to do this transformation.
>>>>
>>>>> Using this change, I manage to trigger folding during the second invocation of
>>>>> pass_ccp, before iv_optimize so we generate optimal code.
>>>>>
>>>>> Index: gcc/tree-ssa-ccp.c
>>>>> ===================================================================
>>>>> --- gcc/tree-ssa-ccp.c (revision 173734)
>>>>> +++ gcc/tree-ssa-ccp.c (working copy)
>>>>> @@ -1727,6 +1727,13 @@ ccp_fold_stmt (gimple_stmt_iterator *gsi
>>>>>    if (gimple_call_internal_p (stmt))
>>>>>      return false;
>>>>>
>>>>> +        /* The heuristic of fold_builtin_alloca differs before and after
>>>>> +           inlining, so we don't require the arg to be changed into a constant
>>>>> +           for folding, but just to be constant.  */
>>>>> +        if (gimple_call_alloca_for_var_p (stmt)
>>>>> +            && get_constant_value (gimple_call_arg (stmt, 0)))
>>>>
>>>> Probably reverse the get_constant_value check and the transformation
>>>
>>> Done.
>>>
>>>> (gimple_call_alloca_for_var_p isn't a predicate as it has side-effects,
>>>> so its name should be changed).
>>>>
>>>>> +          return true;
>>>>> +
>>>>>    /* Propagate into the call arguments.  Compared to replace_uses_in
>>>>>       this can use the argument slot types for type verification
>>>>>       instead of the current argument type.  We also can safely
>>>>>
>>>>> But, to me it feels like a hack. Do you have any ideas how to do this better?
>>>>
>>>> It's somewhat of a hack, but at least it is more of a defined place
>>>> for this transformation - which then suggests to remove it from
>>>> generic folding and only keep calling it from CCP this way.
>>>>
>>>
>>> Done.
>>>
>>
>> This is an updated version of the patch. I have 2 new patches and an updated
>> testcase which I will sent out individually.
>>
>
> Previously stores to vla's were marked as alive in dce because
> is_hidden_global_store triggered.  With them converted to stores to a normal
> array, that doesn't happen anymore.
> The stores where then sometimes removed, while there was an aliasing load, a
> function argument. But since ref_may_be_aliased was called with WITH_SIZE_EXPR
> as toplevel expr, ref_may_be_aliased returned false for the aliasing load.
> This patch fixes this.
>
> OK for trunk?

Oops.

Ok.

Thanks,
Richard.

> Thanks,
> - Tom
>
> 2011-07-30  Tom de Vries  <tom@codesourcery.com>
>
>        PR middle-end/43513
>        * tree-ssa-dce.c (ref_may_be_aliased): Add assert.
>        (propagate_necessity): Handle WITH_SIZE_EXPR call arg.
>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513, 1/3] Replace vla with array - Implementation.
  2011-07-30  9:23                     ` Tom de Vries
@ 2011-07-30 14:22                       ` Richard Guenther
  0 siblings, 0 replies; 37+ messages in thread
From: Richard Guenther @ 2011-07-30 14:22 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Richard Guenther, gcc-patches, Zdenek Dvorak

On Sat, Jul 30, 2011 at 9:24 AM, Tom de Vries <vries@codesourcery.com> wrote:
> On 07/30/2011 10:21 AM, Tom de Vries wrote:
>> Hi,
>>
>> On 07/28/2011 08:20 PM, Tom de Vries wrote:
>>> On 07/28/2011 06:25 PM, Richard Guenther wrote:
>>>> On Thu, 28 Jul 2011, Tom de Vries wrote:
>>>>
>>>>> On 07/28/2011 12:22 PM, Richard Guenther wrote:
>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>
>>>>>>> On 07/27/2011 05:27 PM, Richard Guenther wrote:
>>>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>>>
>>>>>>>>> On 07/27/2011 02:12 PM, Richard Guenther wrote:
>>>>>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>>>>>
>>>>>>>>>>> On 07/27/2011 01:50 PM, Tom de Vries wrote:
>>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>>
>>>>>>>>>>>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
>>>>>>>>>>>>
>>>>>>>>>>>> 01_pr43513.3.patch
>>>>>>>>>>>> 02_pr43513.3.test.patch
>>>>>>>>>>>> 03_pr43513.3.mudflap.patch
>>>>>>>>>>>>
>>>>>>>>>>>> The patch set has been bootstrapped and reg-tested on x86_64.
>>>>>>>>>>>>
>>>>>>>>>>>> I will sent out the patches individually.
>>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> The patch replaces a vla __builtin_alloca that has a constant argument with an
>>>>>>>>>>> array declaration.
>>>>>>>>>>>
>>>>>>>>>>> OK for trunk?
>>>>>>>>>>
>>>>>>>>>> I don't think it is safe to try to get at the VLA type the way you do.
>>>>>>>>>
>>>>>>>>> I don't understand in what way it's not safe. Do you mean I don't manage to find
>>>>>>>>> the type always, or that I find the wrong type, or something else?
>>>>>>>>
>>>>>>>> I think you might get the wrong type,
>>>>>>>
>>>>>>> Ok, I'll review that code one more time.
>>>>>>>
>>>>>>>> you also do not transform code
>>>>>>>> like
>>>>>>>>
>>>>>>>>   int *p = alloca(4);
>>>>>>>>   *p = 3;
>>>>>>>>
>>>>>>>> as there is no array type involved here.
>>>>>>>>
>>>>>>>
>>>>>>> I was trying to stay away from non-vla allocas.  A source declared alloca has
>>>>>>> function livetime, so we could have a single alloca in a loop, called 10 times,
>>>>>>> with all 10 instances live at the same time. This patch does not detect such
>>>>>>> cases, and thus stays away from non-vla allocas. A vla decl does not have such
>>>>>>> problems, the lifetime ends when it goes out of scope.
>>>>>>
>>>>>> Yes indeed - that probably would require more detailed analysis.
>>>>>>
>>>>>>>>>> In fact I would simply do sth like
>>>>>>>>>>
>>>>>>>>>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>>>>>>>>>>   n_elem = size * 8 / BITS_PER_UNIT;
>>>>>>>>>>   array_type = build_array_type_nelts (elem_type, n_elem);
>>>>>>>>>>   var = create_tmp_var (array_type, NULL);
>>>>>>>>>>   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> I tried this code on the example, and it works, but the newly declared type has
>>>>>>>>> an 8-bit alignment, while the vla base type has a 32 bit alignment.  This make
>>>>>>>>> the memory access in the example potentially unaligned, which prohibits an
>>>>>>>>> ivopts optimization, so the resulting text size is 68 instead of the 64 achieved
>>>>>>>>> with my current patch.
>>>>>>>>
>>>>>>>> Ok, so then set DECL_ALIGN of the variable to something reasonable
>>>>>>>> like MIN (size * 8, GET_MODE_PRECISION (word_mode)).  Basically the
>>>>>>>> alignment that the targets alloca function would guarantee.
>>>>>>>>
>>>>>>>
>>>>>>> I tried that, but that doesn't help. It's the alignment of the type that
>>>>>>> matters, not of the decl.
>>>>>>
>>>>>> It shouldn't.  All accesses are performed with the original types and
>>>>>> alignment comes from that (plus the underlying decl).
>>>>>>
>>>>>
>>>>> I managed to get it all working by using build_aligned_type rather that DECL_ALIGN.
>>>>
>>>> That's really odd, DECL_ALIGN should just work - nothing refers to the
>>>> type of the decl in the IL.  Can you try also setting DECL_USER_ALIGN to
>>>> 1 maybe?
>>>>
>>>
>>> This doesn't work either.
>>>
>>>   /* Declare array.  */
>>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>>>   n_elem = size * 8 / BITS_PER_UNIT;
>>>   align = MIN (size * 8, GET_MODE_PRECISION (word_mode));
>>>   array_type = build_array_type_nelts (elem_type, n_elem);
>>>   var = create_tmp_var (array_type, NULL);
>>>   DECL_ALIGN (var) = align;
>>>   DECL_USER_ALIGN (var) = 1;
>>>
>>> Maybe this clarifies it:
>>>
>>> Breakpoint 1, may_be_unaligned_p (ref=0xf7d9d410, step=0xf7d3d578) at
>>> /home/vries/local/google/src/gcc-mainline/gcc/tree-ssa-loop-ivopts.c:1621
>>> (gdb) call debug_generic_expr (ref)
>>> MEM[(int[0:D.2579] *)&D.2595][0]
>>> (gdb) call debug_generic_expr (step)
>>> 4
>>>
>>> 1627   base = get_inner_reference (ref, &bitsize, &bitpos, &toffset, &mode,
>>> (gdb) call debug_generic_expr (base)
>>> D.2595
>>>
>>> 1629   base_type = TREE_TYPE (base);
>>> (gdb) call debug_generic_expr (base_type)
>>> <unnamed-unsigned:8>[40]
>>>
>>> 1630   base_align = TYPE_ALIGN (base_type);
>>> (gdb) p base_align
>>> $1 = 8
>>>
>>> So the align is 8-bits, and we return true here:
>>>
>>> (gdb) n
>>> 1632   if (mode != BLKmode)
>>> (gdb) n
>>> 1634       unsigned mode_align = GET_MODE_ALIGNMENT (mode);
>>> (gdb)
>>> 1636       if (base_align < mode_align
>>> (gdb)
>>> 1639         return true;
>>>
>>>
>>> Here we can see that the base actually has the (user) align on it:
>>>
>>> (gdb) call debug_tree (base)
>>>  <var_decl 0xf7e1b420 D.2595
>>>     type <array_type 0xf7e1b360
>>>         type <integer_type 0xf7e1b2a0 public unsigned QI
>>>             size <integer_cst 0xf7d3d604 constant 8>
>>>             unit size <integer_cst 0xf7d3d620 constant 1>
>>>             align 8 symtab 0 alias set -1 canonical type 0xf7e1b2a0 precision 8
>>>             min <integer_cst 0xf7dffaf0 0> max <integer_cst 0xf7dffb0c 255>
>>>             pointer_to_this <pointer_type 0xf7e1b3c0>>
>>>         BLK
>>>         size <integer_cst 0xf7d5d070 constant 320>
>>>         unit size <integer_cst 0xf7dde2a0 constant 40>
>>>         align 8 symtab 0 alias set -1 canonical type 0xf7e1b360
>>>         domain <integer_type 0xf7de12a0
>>>                 type <integer_type 0xf7d51000 unsigned int>
>>>             SI
>>>             size <integer_cst 0xf7d3d78c constant 32>
>>>             unit size <integer_cst 0xf7d3d578 constant 4>
>>>             align 32 symtab 0 alias set -1 canonical type 0xf7de12a0
>>>             precision 32 min <integer_cst 0xf7d3d594 0>
>>>             max <integer_cst 0xf7dde284 39>>
>>>         pointer_to_this <pointer_type 0xf7e1b480>>
>>>     addressable used ignored BLK file pr43513.c line 4 col 6
>>>     size <integer_cst 0xf7d5d070 320> unit size <integer_cst 0xf7dde2a0 40>
>>>     user align 32 context <function_decl 0xf7dfd480 foo3>>
>>>
>>>
>>>>>
>>>>>>> So should we try to find the base type of the vla, and use that, or use the
>>>>>>> nonstandard char type?
>>>>>>
>>>>>> I don't think we can reliably find the base type of the vla - well,
>>>>>> in practice we may because we control how we lower VLAs during
>>>>>> gimplification, but nothing in the IL constraints say that the
>>>>>> resulting pointer type should be special.
>>>>>>
>>>>>> Using a char[] decl shouldn't be a problem IMHO.
>>>>>>
>>>>>>>>>> And obviously you lose the optimization we arrange with inserting
>>>>>>>>>> __builtin_stack_save/restore pairs that way - stack space will no
>>>>>>>>>> longer be shared for subsequent VLAs.  Which means that you'd
>>>>>>>>>> better limit the size you allow this promotion.
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Right, I could introduce a parameter for this.
>>>>>>>>
>>>>>>>> I would think you could use PARAM_LARGE_STACK_FRAME for now and say,
>>>>>>>> allow a size of PARAM_LARGE_STACK_FRAME / 10?
>>>>>>>>
>>>>>>>
>>>>>>> That unfortunately is too small for the example from bug report. The default
>>>>>>> value of the param is 250, so that would be a threshold of 25, and the alloca
>>>>>>> size of the example is 40.  Perhaps we can try a threshold of
>>>>>>> PARAM_LARGE_STACK_FRAME - estimated_stack_size or some such?
>>>>>>
>>>>>> Hm.  estimated_stack_size is not O(1), so no.  I think we need to
>>>>>> find a sensible way of allowing stack sharing.  Eventually Michas
>>>>>> patch for introducing points-of-death would help here, if we'd
>>>>>> go for folding this during stack-save/restore optimization.
>>>>>>
>>>>>
>>>>> I changed the heuristics to this:
>>>>>
>>>>> +  /* Heuristic: don't fold large vlas.  */
>>>>> +  threshold = (unsigned HOST_WIDE_INT)PARAM_VALUE (PARAM_LARGE_STACK_FRAME);
>>>>> +  /* In case a vla is declared at function scope, it has the same lifetime as a
>>>>> +     declared array, so we allow a larger size.  */
>>>>> +  block = gimple_block (stmt);
>>>>> +  if (!(cfun->after_inlining
>>>>> +        && TREE_CODE (BLOCK_SUPERCONTEXT (block)) == FUNCTION_DECL))
>>>>> +    threshold /= 10;
>>>>> +  if (size > threshold)
>>>>> +    return NULL_TREE;
>>>>>
>>>>> The heuristics distinguishes between before and after inlining.
>>>>>
>>>>> After inlining, vla's declared at function scope have the same lifetimes as
>>>>> declared arrays, and don't share their space. There should be no negative
>>>>> effects from folding an alloca in this case, but for safety we set a threshold
>>>>> of PARAM_LARGE_STACK_FRAME.
>>>>>
>>>>> Before inlining, such a vla might be inlined and share its space with another
>>>>> vla, so we stick with the normal threshold before inlining.
>>>>
>>>> That sounds reasonable, though the block check should probably use the
>>>> original VLA decl block, not that of the basic-block of the allocation,
>>>> but unfortunately we don't have access to that.  So I suppose using
>>>> the allocation basic-block BLOCK is good enough (still we don't
>>>> really care about BLOCK boundaries when doing CFG manipulations, so
>>>> the allocation bbs block may be not the outermost scope in more cases
>>>> than necessary).
>>>>
>>>>> However, using this heuristic we still don't generate optimal code.
>>>>>
>>>>> During the first pass_ccp, the folding is not done, because the size (40) is
>>>>> larger than the threshold 25. The threshold is 25, because inlining is not yet done.
>>>>>
>>>>> During pass_fold_builtins, the folding is done because it's after inlining, but
>>>>> it's later than pass_iv_optimize, so that still doesn't yield the optimal size
>>>>> of 64.
>>>>>
>>>>> The folding is not done during any of the other invocations or pass_ccp, because
>>>>> the argument has already become constant in the earlier invocation.
>>>>
>>>> Yeah, that's the issue with relying on folding to do this transformation.
>>>>
>>>>> Using this change, I manage to trigger folding during the second invocation of
>>>>> pass_ccp, before iv_optimize so we generate optimal code.
>>>>>
>>>>> Index: gcc/tree-ssa-ccp.c
>>>>> ===================================================================
>>>>> --- gcc/tree-ssa-ccp.c (revision 173734)
>>>>> +++ gcc/tree-ssa-ccp.c (working copy)
>>>>> @@ -1727,6 +1727,13 @@ ccp_fold_stmt (gimple_stmt_iterator *gsi
>>>>>    if (gimple_call_internal_p (stmt))
>>>>>      return false;
>>>>>
>>>>> +        /* The heuristic of fold_builtin_alloca differs before and after
>>>>> +           inlining, so we don't require the arg to be changed into a constant
>>>>> +           for folding, but just to be constant.  */
>>>>> +        if (gimple_call_alloca_for_var_p (stmt)
>>>>> +            && get_constant_value (gimple_call_arg (stmt, 0)))
>>>>
>>>> Probably reverse the get_constant_value check and the transformation
>>>
>>> Done.
>>>
>>>> (gimple_call_alloca_for_var_p isn't a predicate as it has side-effects,
>>>> so its name should be changed).
>>>>
>>>>> +          return true;
>>>>> +
>>>>>    /* Propagate into the call arguments.  Compared to replace_uses_in
>>>>>       this can use the argument slot types for type verification
>>>>>       instead of the current argument type.  We also can safely
>>>>>
>>>>> But, to me it feels like a hack. Do you have any ideas how to do this better?
>>>>
>>>> It's somewhat of a hack, but at least it is more of a defined place
>>>> for this transformation - which then suggests to remove it from
>>>> generic folding and only keep calling it from CCP this way.
>>>>
>>>
>>> Done.
>>>
>>
>> This is an updated version of the patch. I have 2 new patches and an updated
>> testcase which I will sent out individually.
>>
>
> 2011-07-30  Tom de Vries  <tom@codesourcery.com>
>
>        PR middle-end/43513
>        * tree-ssa-loop-ivopts.c (may_be_unaligned_p): Use get_object_alignment.

I think it should do

   base_type = TREE_TYPE (base);
   base_align = get_object_alignment (base, BIGGEST_ALIGNMENT);
   base_align = MAX (base_align, TYPE_ALIGN (base_type));

for alignment that is less than the alignment of the type we rely on people
building variant types.  get_object_alignment is too conservative to be used
alone.

Ok with the above change.

As further improvement the code could use get_object_alignment_1 on the
full reference to also track misalignment using that function, removing the
seemingly duplicate code in may_be_unaligned_p.

Thanks,
Richard.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-30  9:01                   ` Tom de Vries
  2011-07-30  9:23                     ` Tom de Vries
  2011-07-30  9:27                     ` Tom de Vries
@ 2011-08-07 12:00                     ` Tom de Vries
  2011-08-17 12:46                       ` [PING][PATCH PR43513] Replace vla with array Tom de Vries
  2011-08-24  9:31                     ` [PING][PATCH PR43513, 1/3] Replace vla with array - Implementation Tom de Vries
                                       ` (2 subsequent siblings)
  5 siblings, 1 reply; 37+ messages in thread
From: Tom de Vries @ 2011-08-07 12:00 UTC (permalink / raw)
  To: Richard Guenther; +Cc: gcc-patches, Zdenek Dvorak

On 07/30/2011 09:21 AM, Tom de Vries wrote:
> Hi,
> 
> On 07/28/2011 08:20 PM, Tom de Vries wrote:
>> On 07/28/2011 06:25 PM, Richard Guenther wrote:
>>> On Thu, 28 Jul 2011, Tom de Vries wrote:
>>>
>>>> On 07/28/2011 12:22 PM, Richard Guenther wrote:
>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>
>>>>>> On 07/27/2011 05:27 PM, Richard Guenther wrote:
>>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>>
>>>>>>>> On 07/27/2011 02:12 PM, Richard Guenther wrote:
>>>>>>>>> On Wed, 27 Jul 2011, Tom de Vries wrote:
>>>>>>>>>
>>>>>>>>>> On 07/27/2011 01:50 PM, Tom de Vries wrote:
>>>>>>>>>>> Hi Richard,
>>>>>>>>>>>
>>>>>>>>>>> I have a patch set for bug 43513 - The stack pointer is adjusted twice.
>>>>>>>>>>>
>>>>>>>>>>> 01_pr43513.3.patch
>>>>>>>>>>> 02_pr43513.3.test.patch
>>>>>>>>>>> 03_pr43513.3.mudflap.patch
>>>>>>>>>>>
>>>>>>>>>>> The patch set has been bootstrapped and reg-tested on x86_64.
>>>>>>>>>>>
>>>>>>>>>>> I will sent out the patches individually.
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> The patch replaces a vla __builtin_alloca that has a constant argument with an
>>>>>>>>>> array declaration.
>>>>>>>>>>
>>>>>>>>>> OK for trunk?
>>>>>>>>>
>>>>>>>>> I don't think it is safe to try to get at the VLA type the way you do.
>>>>>>>>
>>>>>>>> I don't understand in what way it's not safe. Do you mean I don't manage to find
>>>>>>>> the type always, or that I find the wrong type, or something else?
>>>>>>>
>>>>>>> I think you might get the wrong type,
>>>>>>
>>>>>> Ok, I'll review that code one more time.
>>>>>>
>>>>>>> you also do not transform code
>>>>>>> like
>>>>>>>
>>>>>>>   int *p = alloca(4);
>>>>>>>   *p = 3;
>>>>>>>
>>>>>>> as there is no array type involved here.
>>>>>>>
>>>>>>
>>>>>> I was trying to stay away from non-vla allocas.  A source declared alloca has
>>>>>> function livetime, so we could have a single alloca in a loop, called 10 times,
>>>>>> with all 10 instances live at the same time. This patch does not detect such
>>>>>> cases, and thus stays away from non-vla allocas. A vla decl does not have such
>>>>>> problems, the lifetime ends when it goes out of scope.
>>>>>
>>>>> Yes indeed - that probably would require more detailed analysis.
>>>>>
>>>>>>>>> In fact I would simply do sth like
>>>>>>>>>
>>>>>>>>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>>>>>>>>>   n_elem = size * 8 / BITS_PER_UNIT;
>>>>>>>>>   array_type = build_array_type_nelts (elem_type, n_elem);
>>>>>>>>>   var = create_tmp_var (array_type, NULL);
>>>>>>>>>   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
>>>>>>>>>
>>>>>>>>
>>>>>>>> I tried this code on the example, and it works, but the newly declared type has
>>>>>>>> an 8-bit alignment, while the vla base type has a 32 bit alignment.  This make
>>>>>>>> the memory access in the example potentially unaligned, which prohibits an
>>>>>>>> ivopts optimization, so the resulting text size is 68 instead of the 64 achieved
>>>>>>>> with my current patch.
>>>>>>>
>>>>>>> Ok, so then set DECL_ALIGN of the variable to something reasonable
>>>>>>> like MIN (size * 8, GET_MODE_PRECISION (word_mode)).  Basically the
>>>>>>> alignment that the targets alloca function would guarantee.
>>>>>>>
>>>>>>
>>>>>> I tried that, but that doesn't help. It's the alignment of the type that
>>>>>> matters, not of the decl.
>>>>>
>>>>> It shouldn't.  All accesses are performed with the original types and
>>>>> alignment comes from that (plus the underlying decl).
>>>>>
>>>>
>>>> I managed to get it all working by using build_aligned_type rather that DECL_ALIGN.
>>>
>>> That's really odd, DECL_ALIGN should just work - nothing refers to the
>>> type of the decl in the IL.  Can you try also setting DECL_USER_ALIGN to 
>>> 1 maybe?
>>>
>>
>> This doesn't work either.
>>
>>   /* Declare array.  */
>>   elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
>>   n_elem = size * 8 / BITS_PER_UNIT;
>>   align = MIN (size * 8, GET_MODE_PRECISION (word_mode));
>>   array_type = build_array_type_nelts (elem_type, n_elem);
>>   var = create_tmp_var (array_type, NULL);
>>   DECL_ALIGN (var) = align;
>>   DECL_USER_ALIGN (var) = 1;
>>
>> Maybe this clarifies it:
>>
>> Breakpoint 1, may_be_unaligned_p (ref=0xf7d9d410, step=0xf7d3d578) at
>> /home/vries/local/google/src/gcc-mainline/gcc/tree-ssa-loop-ivopts.c:1621
>> (gdb) call debug_generic_expr (ref)
>> MEM[(int[0:D.2579] *)&D.2595][0]
>> (gdb) call debug_generic_expr (step)
>> 4
>>
>> 1627	  base = get_inner_reference (ref, &bitsize, &bitpos, &toffset, &mode,
>> (gdb) call debug_generic_expr (base)
>> D.2595
>>
>> 1629	  base_type = TREE_TYPE (base);
>> (gdb) call debug_generic_expr (base_type)
>> <unnamed-unsigned:8>[40]
>>
>> 1630	  base_align = TYPE_ALIGN (base_type);
>> (gdb) p base_align
>> $1 = 8
>>
>> So the align is 8-bits, and we return true here:
>>
>> (gdb) n
>> 1632	  if (mode != BLKmode)
>> (gdb) n
>> 1634	      unsigned mode_align = GET_MODE_ALIGNMENT (mode);
>> (gdb)
>> 1636	      if (base_align < mode_align
>> (gdb)
>> 1639		return true;
>>
>>
>> Here we can see that the base actually has the (user) align on it:
>>
>> (gdb) call debug_tree (base)
>>  <var_decl 0xf7e1b420 D.2595
>>     type <array_type 0xf7e1b360
>>         type <integer_type 0xf7e1b2a0 public unsigned QI
>>             size <integer_cst 0xf7d3d604 constant 8>
>>             unit size <integer_cst 0xf7d3d620 constant 1>
>>             align 8 symtab 0 alias set -1 canonical type 0xf7e1b2a0 precision 8
>>             min <integer_cst 0xf7dffaf0 0> max <integer_cst 0xf7dffb0c 255>
>>             pointer_to_this <pointer_type 0xf7e1b3c0>>
>>         BLK
>>         size <integer_cst 0xf7d5d070 constant 320>
>>         unit size <integer_cst 0xf7dde2a0 constant 40>
>>         align 8 symtab 0 alias set -1 canonical type 0xf7e1b360
>>         domain <integer_type 0xf7de12a0
>>                 type <integer_type 0xf7d51000 unsigned int>
>>             SI
>>             size <integer_cst 0xf7d3d78c constant 32>
>>             unit size <integer_cst 0xf7d3d578 constant 4>
>>             align 32 symtab 0 alias set -1 canonical type 0xf7de12a0
>>             precision 32 min <integer_cst 0xf7d3d594 0>
>>             max <integer_cst 0xf7dde284 39>>
>>         pointer_to_this <pointer_type 0xf7e1b480>>
>>     addressable used ignored BLK file pr43513.c line 4 col 6
>>     size <integer_cst 0xf7d5d070 320> unit size <integer_cst 0xf7dde2a0 40>
>>     user align 32 context <function_decl 0xf7dfd480 foo3>>
>>
>>
>>>>
>>>>>> So should we try to find the base type of the vla, and use that, or use the
>>>>>> nonstandard char type?
>>>>>
>>>>> I don't think we can reliably find the base type of the vla - well,
>>>>> in practice we may because we control how we lower VLAs during
>>>>> gimplification, but nothing in the IL constraints say that the
>>>>> resulting pointer type should be special.
>>>>>
>>>>> Using a char[] decl shouldn't be a problem IMHO.
>>>>>
>>>>>>>>> And obviously you lose the optimization we arrange with inserting
>>>>>>>>> __builtin_stack_save/restore pairs that way - stack space will no
>>>>>>>>> longer be shared for subsequent VLAs.  Which means that you'd
>>>>>>>>> better limit the size you allow this promotion.
>>>>>>>>>
>>>>>>>>
>>>>>>>> Right, I could introduce a parameter for this.
>>>>>>>
>>>>>>> I would think you could use PARAM_LARGE_STACK_FRAME for now and say,
>>>>>>> allow a size of PARAM_LARGE_STACK_FRAME / 10?
>>>>>>>
>>>>>>
>>>>>> That unfortunately is too small for the example from bug report. The default
>>>>>> value of the param is 250, so that would be a threshold of 25, and the alloca
>>>>>> size of the example is 40.  Perhaps we can try a threshold of
>>>>>> PARAM_LARGE_STACK_FRAME - estimated_stack_size or some such?
>>>>>
>>>>> Hm.  estimated_stack_size is not O(1), so no.  I think we need to
>>>>> find a sensible way of allowing stack sharing.  Eventually Michas
>>>>> patch for introducing points-of-death would help here, if we'd
>>>>> go for folding this during stack-save/restore optimization.
>>>>>
>>>>
>>>> I changed the heuristics to this:
>>>>
>>>> +  /* Heuristic: don't fold large vlas.  */
>>>> +  threshold = (unsigned HOST_WIDE_INT)PARAM_VALUE (PARAM_LARGE_STACK_FRAME);
>>>> +  /* In case a vla is declared at function scope, it has the same lifetime as a
>>>> +     declared array, so we allow a larger size.  */
>>>> +  block = gimple_block (stmt);
>>>> +  if (!(cfun->after_inlining
>>>> +        && TREE_CODE (BLOCK_SUPERCONTEXT (block)) == FUNCTION_DECL))
>>>> +    threshold /= 10;
>>>> +  if (size > threshold)
>>>> +    return NULL_TREE;
>>>>
>>>> The heuristics distinguishes between before and after inlining.
>>>>
>>>> After inlining, vla's declared at function scope have the same lifetimes as
>>>> declared arrays, and don't share their space. There should be no negative
>>>> effects from folding an alloca in this case, but for safety we set a threshold
>>>> of PARAM_LARGE_STACK_FRAME.
>>>>
>>>> Before inlining, such a vla might be inlined and share its space with another
>>>> vla, so we stick with the normal threshold before inlining.
>>>
>>> That sounds reasonable, though the block check should probably use the
>>> original VLA decl block, not that of the basic-block of the allocation,
>>> but unfortunately we don't have access to that.  So I suppose using
>>> the allocation basic-block BLOCK is good enough (still we don't
>>> really care about BLOCK boundaries when doing CFG manipulations, so
>>> the allocation bbs block may be not the outermost scope in more cases
>>> than necessary).
>>>
>>>> However, using this heuristic we still don't generate optimal code.
>>>>
>>>> During the first pass_ccp, the folding is not done, because the size (40) is
>>>> larger than the threshold 25. The threshold is 25, because inlining is not yet done.
>>>>
>>>> During pass_fold_builtins, the folding is done because it's after inlining, but
>>>> it's later than pass_iv_optimize, so that still doesn't yield the optimal size
>>>> of 64.
>>>>
>>>> The folding is not done during any of the other invocations or pass_ccp, because
>>>> the argument has already become constant in the earlier invocation.
>>>
>>> Yeah, that's the issue with relying on folding to do this transformation.
>>>
>>>> Using this change, I manage to trigger folding during the second invocation of
>>>> pass_ccp, before iv_optimize so we generate optimal code.
>>>>
>>>> Index: gcc/tree-ssa-ccp.c
>>>> ===================================================================
>>>> --- gcc/tree-ssa-ccp.c (revision 173734)
>>>> +++ gcc/tree-ssa-ccp.c (working copy)
>>>> @@ -1727,6 +1727,13 @@ ccp_fold_stmt (gimple_stmt_iterator *gsi
>>>>  	if (gimple_call_internal_p (stmt))
>>>>  	  return false;
>>>>
>>>> +        /* The heuristic of fold_builtin_alloca differs before and after
>>>> +           inlining, so we don't require the arg to be changed into a constant
>>>> +           for folding, but just to be constant.  */
>>>> +        if (gimple_call_alloca_for_var_p (stmt)
>>>> +            && get_constant_value (gimple_call_arg (stmt, 0)))
>>>
>>> Probably reverse the get_constant_value check and the transformation
>>
>> Done.
>>
>>> (gimple_call_alloca_for_var_p isn't a predicate as it has side-effects,
>>> so its name should be changed).
>>>
>>>> +          return true;
>>>> +
>>>>  	/* Propagate into the call arguments.  Compared to replace_uses_in
>>>>  	   this can use the argument slot types for type verification
>>>>  	   instead of the current argument type.  We also can safely
>>>>
>>>> But, to me it feels like a hack. Do you have any ideas how to do this better?
>>>
>>> It's somewhat of a hack, but at least it is more of a defined place
>>> for this transformation - which then suggests to remove it from
>>> generic folding and only keep calling it from CCP this way.
>>>
>>
>> Done.
>>
> 
> This is an updated version of the patch. I have 2 new patches and an updated
> testcase which I will sent out individually.
> 
> Patch set was bootstrapped and reg-tested on x86_64.
> 
> Ok for trunk?
> 

Ping. http://gcc.gnu.org/ml/gcc-patches/2011-07/msg02730.html and
http://gcc.gnu.org/ml/gcc-patches/2011-07/msg02733.html ok for trunk?

Thanks,
- Tom

^ permalink raw reply	[flat|nested] 37+ messages in thread

* [PING][PATCH PR43513] Replace vla with array
  2011-08-07 12:00                     ` Tom de Vries
@ 2011-08-17 12:46                       ` Tom de Vries
  0 siblings, 0 replies; 37+ messages in thread
From: Tom de Vries @ 2011-08-17 12:46 UTC (permalink / raw)
  To: Richard Guenther; +Cc: gcc-patches, Zdenek Dvorak

Hi Richard,

Ping. http://gcc.gnu.org/ml/gcc-patches/2011-07/msg02730.html and
http://gcc.gnu.org/ml/gcc-patches/2011-07/msg02733.html ok for trunk?

Thanks,
- Tom

^ permalink raw reply	[flat|nested] 37+ messages in thread

* [PING][PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-30  9:01                   ` Tom de Vries
                                       ` (2 preceding siblings ...)
  2011-08-07 12:00                     ` Tom de Vries
@ 2011-08-24  9:31                     ` Tom de Vries
  2011-08-30 12:29                       ` Richard Guenther
  2011-08-31 15:57                     ` [PATCH " H.J. Lu
  2011-09-24 19:31                     ` Eric Botcazou
  5 siblings, 1 reply; 37+ messages in thread
From: Tom de Vries @ 2011-08-24  9:31 UTC (permalink / raw)
  To: Richard Guenther; +Cc: gcc-patches, Zdenek Dvorak

Hi Richard,

On 07/30/2011 09:21 AM, Tom de Vries wrote:
> This is an updated version of the patch. I have 2 new patches and an updated
> testcase which I will sent out individually.
> 
> Patch set was bootstrapped and reg-tested on x86_64.
> 
> Ok for trunk?
> 

You already approved the the 2 new patches, and I checked them in.

Ping for the updated patch and updated testcase:
http://gcc.gnu.org/ml/gcc-patches/2011-07/msg02730.html
http://gcc.gnu.org/ml/gcc-patches/2011-07/msg02733.html

The patch replaces a vla __builtin_alloca that has a constant argument with an
array declaration.

Thanks,
- Tom

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PING][PATCH PR43513, 1/3] Replace vla with array - Implementation.
  2011-08-24  9:31                     ` [PING][PATCH PR43513, 1/3] Replace vla with array - Implementation Tom de Vries
@ 2011-08-30 12:29                       ` Richard Guenther
  0 siblings, 0 replies; 37+ messages in thread
From: Richard Guenther @ 2011-08-30 12:29 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Richard Guenther, gcc-patches, Zdenek Dvorak

On Wed, Aug 24, 2011 at 10:47 AM, Tom de Vries <vries@codesourcery.com> wrote:
> Hi Richard,
>
> On 07/30/2011 09:21 AM, Tom de Vries wrote:
>> This is an updated version of the patch. I have 2 new patches and an updated
>> testcase which I will sent out individually.
>>
>> Patch set was bootstrapped and reg-tested on x86_64.
>>
>> Ok for trunk?
>>
>
> You already approved the the 2 new patches, and I checked them in.
>
> Ping for the updated patch and updated testcase:
> http://gcc.gnu.org/ml/gcc-patches/2011-07/msg02730.html
> http://gcc.gnu.org/ml/gcc-patches/2011-07/msg02733.html
>
> The patch replaces a vla __builtin_alloca that has a constant argument with an
> array declaration.

Ok with

+  /* Declare array.  */
+  elem_type = build_nonstandard_integer_type (BITS_PER_UNIT, 1);
+  n_elem = size * 8 / BITS_PER_UNIT;
+  align = MIN (size * 8, GET_MODE_PRECISION (word_mode));

changed to use BIGGEST_ALIGNMENT instead of GET_MODE_PRECISION (word_mode).

For the future, can you please include the testcases in the main patch
instead of sending them separately?  You're the only one separating them
and it's really not helpful.

Thanks,
Richard.

> Thanks,
> - Tom
>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513, 1/3] Replace vla with array - Implementation.
  2011-07-30  9:01                   ` Tom de Vries
                                       ` (3 preceding siblings ...)
  2011-08-24  9:31                     ` [PING][PATCH PR43513, 1/3] Replace vla with array - Implementation Tom de Vries
@ 2011-08-31 15:57                     ` H.J. Lu
  2011-09-24 19:31                     ` Eric Botcazou
  5 siblings, 0 replies; 37+ messages in thread
From: H.J. Lu @ 2011-08-31 15:57 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Richard Guenther, gcc-patches, Zdenek Dvorak

On Sat, Jul 30, 2011 at 12:21 AM, Tom de Vries <vries@codesourcery.com> wrote:
>
> This is an updated version of the patch. I have 2 new patches and an updated
> testcase which I will sent out individually.
>
> Patch set was bootstrapped and reg-tested on x86_64.
>
> Ok for trunk?
>
> Thanks,
> - Tom
>
> 2011-07-30  Tom de Vries  <tom@codesourcery.com>
>
>        PR middle-end/43513
>        * Makefile.in (tree-ssa-ccp.o): Add $(PARAMS_H) to rule.
>        * tree-ssa-ccp.c (params.h): Include.
>        (fold_builtin_alloca_for_var): New function.
>        (ccp_fold_stmt): Use fold_builtin_alloca_for_var.
>

This caused:

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=50251

-- 
H.J.

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-07-30  9:01                   ` Tom de Vries
                                       ` (4 preceding siblings ...)
  2011-08-31 15:57                     ` [PATCH " H.J. Lu
@ 2011-09-24 19:31                     ` Eric Botcazou
  2011-09-25  4:47                       ` Tom de Vries
  2011-09-25 10:59                       ` Richard Guenther
  5 siblings, 2 replies; 37+ messages in thread
From: Eric Botcazou @ 2011-09-24 19:31 UTC (permalink / raw)
  To: gcc-patches; +Cc: Tom de Vries, Richard Guenther, Zdenek Dvorak

> This is an updated version of the patch. I have 2 new patches and an
> updated testcase which I will sent out individually.
>
> Patch set was bootstrapped and reg-tested on x86_64.
>
> Ok for trunk?
>
> Thanks,
> - Tom
>
> 2011-07-30  Tom de Vries  <tom@codesourcery.com>
>
> 	PR middle-end/43513
> 	* Makefile.in (tree-ssa-ccp.o): Add $(PARAMS_H) to rule.
> 	* tree-ssa-ccp.c (params.h): Include.
> 	(fold_builtin_alloca_for_var): New function.
> 	(ccp_fold_stmt): Use fold_builtin_alloca_for_var.

We have detected another fallout on some Ada code: the transformation replaces 
a call to __builtin_alloca with &var, i.e. it introduces an aliased variable, 
which invalidates the points-to information of some subsequent call, fooling 
DSE into thinking that it can eliminate a live store.

The brute force approach

Index: tree-ssa-ccp.c
===================================================================
--- tree-ssa-ccp.c      (revision 179038)
+++ tree-ssa-ccp.c      (working copy)
@@ -2014,7 +2014,10 @@ do_ssa_ccp (void)
   ccp_initialize ();
   ssa_propagate (ccp_visit_stmt, ccp_visit_phi_node);
   if (ccp_finalize ())
-    return (TODO_cleanup_cfg | TODO_update_ssa | TODO_remove_unused_locals);
+    return (TODO_cleanup_cfg
+           | TODO_update_ssa
+           | TODO_rebuild_alias
+           | TODO_remove_unused_locals);
   else
     return 0;
 }

works, but we might want to be move clever.  Thoughts?

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-09-24 19:31                     ` Eric Botcazou
@ 2011-09-25  4:47                       ` Tom de Vries
  2011-09-25 10:21                         ` Eric Botcazou
  2011-09-25 10:59                       ` Richard Guenther
  1 sibling, 1 reply; 37+ messages in thread
From: Tom de Vries @ 2011-09-25  4:47 UTC (permalink / raw)
  To: Eric Botcazou; +Cc: gcc-patches, Tom de Vries, Richard Guenther, Zdenek Dvorak

[-- Attachment #1: Type: text/plain, Size: 1658 bytes --]

On 09/24/2011 05:29 PM, Eric Botcazou wrote:
>> This is an updated version of the patch. I have 2 new patches and an
>> updated testcase which I will sent out individually.
>>
>> Patch set was bootstrapped and reg-tested on x86_64.
>>
>> Ok for trunk?
>>
>> Thanks,
>> - Tom
>>
>> 2011-07-30  Tom de Vries  <tom@codesourcery.com>
>>
>> 	PR middle-end/43513
>> 	* Makefile.in (tree-ssa-ccp.o): Add $(PARAMS_H) to rule.
>> 	* tree-ssa-ccp.c (params.h): Include.
>> 	(fold_builtin_alloca_for_var): New function.
>> 	(ccp_fold_stmt): Use fold_builtin_alloca_for_var.
> 
> We have detected another fallout on some Ada code: the transformation replaces 
> a call to __builtin_alloca with &var, i.e. it introduces an aliased variable, 
> which invalidates the points-to information of some subsequent call, fooling 
> DSE into thinking that it can eliminate a live store.
> 
> The brute force approach
> 
> Index: tree-ssa-ccp.c
> ===================================================================
> --- tree-ssa-ccp.c      (revision 179038)
> +++ tree-ssa-ccp.c      (working copy)
> @@ -2014,7 +2014,10 @@ do_ssa_ccp (void)
>    ccp_initialize ();
>    ssa_propagate (ccp_visit_stmt, ccp_visit_phi_node);
>    if (ccp_finalize ())
> -    return (TODO_cleanup_cfg | TODO_update_ssa | TODO_remove_unused_locals);
> +    return (TODO_cleanup_cfg
> +           | TODO_update_ssa
> +           | TODO_rebuild_alias
> +           | TODO_remove_unused_locals);
>    else
>      return 0;
>  }
> 
> works, but we might want to be move clever.  Thoughts?
> 

How about attached (untested) patch implementing a conservative, but
runtime-efficient approach?

Thanks,
- Tom

[-- Attachment #2: pr43513-alias.patch --]
[-- Type: text/x-patch, Size: 1685 bytes --]

Index: gcc/tree-ssa-ccp.c
===================================================================
--- gcc/tree-ssa-ccp.c (revision 179043)
+++ gcc/tree-ssa-ccp.c (working copy)
@@ -1729,6 +1729,7 @@ fold_builtin_alloca_for_var (gimple stmt
   array_type = build_array_type_nelts (elem_type, n_elem);
   var = create_tmp_var (array_type, NULL);
   DECL_ALIGN (var) = align;
+  pt_solution_add_var (&get_ptr_info (lhs)->pt, var);
 
   /* Fold alloca to the address of the array.  */
   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
Index: gcc/tree-ssa-alias.h
===================================================================
--- gcc/tree-ssa-alias.h (revision 179043)
+++ gcc/tree-ssa-alias.h (working copy)
@@ -125,6 +125,7 @@ extern void dump_alias_stats (FILE *);
 
 /* In tree-ssa-structalias.c  */
 extern unsigned int compute_may_aliases (void);
+extern void pt_solution_add_var (struct pt_solution *, tree);
 extern bool pt_solution_empty_p (struct pt_solution *);
 extern bool pt_solution_includes_global (struct pt_solution *);
 extern bool pt_solution_includes (struct pt_solution *, const_tree);
Index: gcc/tree-ssa-structalias.c
===================================================================
--- gcc/tree-ssa-structalias.c (revision 179043)
+++ gcc/tree-ssa-structalias.c (working copy)
@@ -5952,6 +5952,14 @@ pt_solution_ior_into (struct pt_solution
   bitmap_ior_into (dest->vars, src->vars);
 }
 
+void
+pt_solution_add_var (struct pt_solution *dest, tree var)
+{
+  struct pt_solution var_pt;
+  pt_solution_set_var (&var_pt, var);
+  pt_solution_ior_into (dest, &var_pt);
+}
+
 /* Return true if the points-to solution *PT is empty.  */
 
 bool

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513,  1/3] Replace vla with array - Implementation.
  2011-09-25  4:47                       ` Tom de Vries
@ 2011-09-25 10:21                         ` Eric Botcazou
  0 siblings, 0 replies; 37+ messages in thread
From: Eric Botcazou @ 2011-09-25 10:21 UTC (permalink / raw)
  To: Tom de Vries; +Cc: gcc-patches, Tom de Vries, Richard Guenther, Zdenek Dvorak

> How about attached (untested) patch implementing a conservative, but
> runtime-efficient approach?

This doesn't work.  My understanding is that you need to recompute far more 
than that, in particular the points-to information for _all_ the calls in the 
function.  I don't know enough of the machinery to tell whether this can be 
done incrementally, but I wouldn't think so, so passing TODO_rebuild_alias 
might be the only solution (in a less brutal way of course).

I'll post a testcase, but it depends on some other, not yet installed changes.

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513, 1/3] Replace vla with array - Implementation.
  2011-09-24 19:31                     ` Eric Botcazou
  2011-09-25  4:47                       ` Tom de Vries
@ 2011-09-25 10:59                       ` Richard Guenther
  2011-09-25 11:42                         ` Tom de Vries
  2011-09-26 11:00                         ` Eric Botcazou
  1 sibling, 2 replies; 37+ messages in thread
From: Richard Guenther @ 2011-09-25 10:59 UTC (permalink / raw)
  To: Eric Botcazou; +Cc: gcc-patches, Tom de Vries, Richard Guenther, Zdenek Dvorak

On Sat, Sep 24, 2011 at 5:29 PM, Eric Botcazou <ebotcazou@adacore.com> wrote:
>> This is an updated version of the patch. I have 2 new patches and an
>> updated testcase which I will sent out individually.
>>
>> Patch set was bootstrapped and reg-tested on x86_64.
>>
>> Ok for trunk?
>>
>> Thanks,
>> - Tom
>>
>> 2011-07-30  Tom de Vries  <tom@codesourcery.com>
>>
>>       PR middle-end/43513
>>       * Makefile.in (tree-ssa-ccp.o): Add $(PARAMS_H) to rule.
>>       * tree-ssa-ccp.c (params.h): Include.
>>       (fold_builtin_alloca_for_var): New function.
>>       (ccp_fold_stmt): Use fold_builtin_alloca_for_var.
>
> We have detected another fallout on some Ada code: the transformation replaces
> a call to __builtin_alloca with &var, i.e. it introduces an aliased variable,
> which invalidates the points-to information of some subsequent call, fooling
> DSE into thinking that it can eliminate a live store.

Ugh, yeah.  I suppose PTA assigned a HEAP var as pointed-to object for the
original pointer, even if the transformed stmt

 orig_ptr_1 = &a;

has the points-to information preserved for orig_ptr_1 further propagation of
&a will make accesses through orig_ptr_1 have different alias properties.

What should work in this special case of a singleton points-to set of orig_ptr_1
(might want to check that) is, do

  SET_DECL_PT_UID (decl-of-a, DECL_UID (pointed-to orig_ptr_1));

The brute force approach is not acceptable (it'll wreck IPA points-to info).

A helper like pt_solution_singleton_p (struct pt_solution *pt, unsigned *uid)
whould be nice to have for this.

Note that we don't have points-to information computed during the first
CCP pass, so the above should be conditional on SSA_NAME_PTR_INFO
being present and not ! ->anything (but then assert that we actually do have
a singleton, or fail the folding).

Richard.

> The brute force approach
>
> Index: tree-ssa-ccp.c
> ===================================================================
> --- tree-ssa-ccp.c      (revision 179038)
> +++ tree-ssa-ccp.c      (working copy)
> @@ -2014,7 +2014,10 @@ do_ssa_ccp (void)
>   ccp_initialize ();
>   ssa_propagate (ccp_visit_stmt, ccp_visit_phi_node);
>   if (ccp_finalize ())
> -    return (TODO_cleanup_cfg | TODO_update_ssa | TODO_remove_unused_locals);
> +    return (TODO_cleanup_cfg
> +           | TODO_update_ssa
> +           | TODO_rebuild_alias
> +           | TODO_remove_unused_locals);
>   else
>     return 0;
>  }
>
> works, but we might want to be move clever.  Thoughts?
>
> --
> Eric Botcazou
>

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513, 1/3] Replace vla with array - Implementation.
  2011-09-25 10:59                       ` Richard Guenther
@ 2011-09-25 11:42                         ` Tom de Vries
  2011-09-26 11:03                           ` Eric Botcazou
  2011-09-26 11:59                           ` Richard Guenther
  2011-09-26 11:00                         ` Eric Botcazou
  1 sibling, 2 replies; 37+ messages in thread
From: Tom de Vries @ 2011-09-25 11:42 UTC (permalink / raw)
  To: Richard Guenther
  Cc: Eric Botcazou, gcc-patches, Tom de Vries, Richard Guenther,
	Zdenek Dvorak

[-- Attachment #1: Type: text/plain, Size: 2807 bytes --]

On 09/25/2011 10:57 AM, Richard Guenther wrote:
> On Sat, Sep 24, 2011 at 5:29 PM, Eric Botcazou <ebotcazou@adacore.com> wrote:
>>> This is an updated version of the patch. I have 2 new patches and an
>>> updated testcase which I will sent out individually.
>>>
>>> Patch set was bootstrapped and reg-tested on x86_64.
>>>
>>> Ok for trunk?
>>>
>>> Thanks,
>>> - Tom
>>>
>>> 2011-07-30  Tom de Vries  <tom@codesourcery.com>
>>>
>>>       PR middle-end/43513
>>>       * Makefile.in (tree-ssa-ccp.o): Add $(PARAMS_H) to rule.
>>>       * tree-ssa-ccp.c (params.h): Include.
>>>       (fold_builtin_alloca_for_var): New function.
>>>       (ccp_fold_stmt): Use fold_builtin_alloca_for_var.
>>
>> We have detected another fallout on some Ada code: the transformation replaces
>> a call to __builtin_alloca with &var, i.e. it introduces an aliased variable,
>> which invalidates the points-to information of some subsequent call, fooling
>> DSE into thinking that it can eliminate a live store.
> 
> Ugh, yeah.  I suppose PTA assigned a HEAP var as pointed-to object for the
> original pointer, even if the transformed stmt
> 
>  orig_ptr_1 = &a;
> 
> has the points-to information preserved for orig_ptr_1 further propagation of
> &a will make accesses through orig_ptr_1 have different alias properties.
> 
> What should work in this special case of a singleton points-to set of orig_ptr_1
> (might want to check that) is, do
> 
>   SET_DECL_PT_UID (decl-of-a, DECL_UID (pointed-to orig_ptr_1));
> 
> The brute force approach is not acceptable (it'll wreck IPA points-to info).
> 
> A helper like pt_solution_singleton_p (struct pt_solution *pt, unsigned *uid)
> whould be nice to have for this.
> 
> Note that we don't have points-to information computed during the first
> CCP pass, so the above should be conditional on SSA_NAME_PTR_INFO
> being present and not ! ->anything (but then assert that we actually do have
> a singleton, or fail the folding).
> 

I tried to implement the approach you describe above in attached patch.
Currently testing on x86_64.

Thanks,
- Tom

> Richard.
> 
>> The brute force approach
>>
>> Index: tree-ssa-ccp.c
>> ===================================================================
>> --- tree-ssa-ccp.c      (revision 179038)
>> +++ tree-ssa-ccp.c      (working copy)
>> @@ -2014,7 +2014,10 @@ do_ssa_ccp (void)
>>   ccp_initialize ();
>>   ssa_propagate (ccp_visit_stmt, ccp_visit_phi_node);
>>   if (ccp_finalize ())
>> -    return (TODO_cleanup_cfg | TODO_update_ssa | TODO_remove_unused_locals);
>> +    return (TODO_cleanup_cfg
>> +           | TODO_update_ssa
>> +           | TODO_rebuild_alias
>> +           | TODO_remove_unused_locals);
>>   else
>>     return 0;
>>  }
>>
>> works, but we might want to be move clever.  Thoughts?
>>
>> --
>> Eric Botcazou
>>


[-- Attachment #2: pr43513-alias.3.patch --]
[-- Type: text/x-patch, Size: 2231 bytes --]

Index: gcc/tree-ssa-ccp.c
===================================================================
--- gcc/tree-ssa-ccp.c (revision 179043)
+++ gcc/tree-ssa-ccp.c (working copy)
@@ -1729,6 +1729,17 @@ fold_builtin_alloca_for_var (gimple stmt
   array_type = build_array_type_nelts (elem_type, n_elem);
   var = create_tmp_var (array_type, NULL);
   DECL_ALIGN (var) = align;
+  {
+    struct ptr_info_def *pi = SSA_NAME_PTR_INFO (lhs);
+    if (pi != NULL && !pi->pt.anything)
+      {
+	bool singleton_p;
+	unsigned uid;
+	singleton_p = pt_solution_singleton_p (&pi->pt, &uid);
+	gcc_assert (singleton_p);
+	SET_DECL_PT_UID (var, uid);
+      }
+  }
 
   /* Fold alloca to the address of the array.  */
   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
Index: gcc/tree-ssa-alias.h
===================================================================
--- gcc/tree-ssa-alias.h (revision 179043)
+++ gcc/tree-ssa-alias.h (working copy)
@@ -126,6 +126,7 @@ extern void dump_alias_stats (FILE *);
 /* In tree-ssa-structalias.c  */
 extern unsigned int compute_may_aliases (void);
 extern bool pt_solution_empty_p (struct pt_solution *);
+extern bool pt_solution_singleton_p (struct pt_solution *, unsigned *);
 extern bool pt_solution_includes_global (struct pt_solution *);
 extern bool pt_solution_includes (struct pt_solution *, const_tree);
 extern bool pt_solutions_intersect (struct pt_solution *, struct pt_solution *);
Index: gcc/tree-ssa-structalias.c
===================================================================
--- gcc/tree-ssa-structalias.c (revision 179043)
+++ gcc/tree-ssa-structalias.c (working copy)
@@ -5978,6 +5978,21 @@ pt_solution_empty_p (struct pt_solution
   return true;
 }
 
+/* Return true if the points-to solution *PT only point to a single var, and
+   return the var uid in *UID.  */
+
+bool
+pt_solution_singleton_p (struct pt_solution *pt, unsigned *uid)
+{
+  if (pt->anything || pt->nonlocal || pt->escaped || pt->ipa_escaped
+      || pt->null || pt->vars == NULL
+      || !bitmap_single_bit_set_p (pt->vars))
+    return false;
+
+  *uid = bitmap_first_set_bit (pt->vars);
+  return true;
+}
+
 /* Return true if the points-to solution *PT includes global memory.  */
 
 bool

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513, 1/3] Replace vla with array - Implementation.
  2011-09-25 10:59                       ` Richard Guenther
  2011-09-25 11:42                         ` Tom de Vries
@ 2011-09-26 11:00                         ` Eric Botcazou
  1 sibling, 0 replies; 37+ messages in thread
From: Eric Botcazou @ 2011-09-26 11:00 UTC (permalink / raw)
  To: Richard Guenther
  Cc: gcc-patches, Tom de Vries, Richard Guenther, Zdenek Dvorak

[-- Attachment #1: Type: text/plain, Size: 946 bytes --]

> Ugh, yeah.  I suppose PTA assigned a HEAP var as pointed-to object for the
> original pointer, even if the transformed stmt
>
>  orig_ptr_1 = &a;
>
> has the points-to information preserved for orig_ptr_1 further propagation
> of &a will make accesses through orig_ptr_1 have different alias
> properties.

AFAICS it's an escaping problem: the new variable isn't seen as escaping at 
some call point.  Testcase attached, compile opt22.adb at -O but make sure 
your tree is up-to-date.

> What should work in this special case of a singleton points-to set of
> orig_ptr_1 (might want to check that) is, do
>
>   SET_DECL_PT_UID (decl-of-a, DECL_UID (pointed-to orig_ptr_1));
>
> The brute force approach is not acceptable (it'll wreck IPA points-to
> info).

OK, thanks for the tip.


2011-09-26  Eric Botcazou  <ebotcazou@adacore.com>

        * gnat.dg/opt22.adb: New test.
        * gnat.dg/opt22_pkg.ad[sb]: New helper.



-- 
Eric Botcazou

[-- Attachment #2: opt22.adb --]
[-- Type: text/x-adasrc, Size: 301 bytes --]

-- { dg-do run }
-- { dg-options "-O" }

with Opt22_Pkg; use Opt22_Pkg;

procedure Opt22 is

   procedure Go (S : String) is
   begin
      begin
        Fail;
      exception
        when Constraint_Error => Put ("the " & S);
      end;
      Put ("the " & S);
   end;

begin
   Go ("message");
end;

[-- Attachment #3: opt22_pkg.ads --]
[-- Type: text/x-adasrc, Size: 89 bytes --]

package Opt22_Pkg is

   procedure Fail;

   procedure Put (S : String);

end Opt22_Pkg;

[-- Attachment #4: opt22_pkg.adb --]
[-- Type: text/x-adasrc, Size: 239 bytes --]

package body Opt22_Pkg is

   procedure Fail is
   begin
      raise Constraint_Error;
   end;

   procedure Put (S : String) is
   begin
      if S /= "the message" then
         raise Program_Error;
      end if;
   end;

end Opt22_Pkg;

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513, 1/3] Replace vla with array - Implementation.
  2011-09-25 11:42                         ` Tom de Vries
@ 2011-09-26 11:03                           ` Eric Botcazou
  2011-09-26 11:59                           ` Richard Guenther
  1 sibling, 0 replies; 37+ messages in thread
From: Eric Botcazou @ 2011-09-26 11:03 UTC (permalink / raw)
  To: Tom de Vries
  Cc: Richard Guenther, gcc-patches, Tom de Vries, Richard Guenther,
	Zdenek Dvorak

> I tried to implement the approach you describe above in attached patch.

Thanks a lot, this indeed fixes the problem!

> Currently testing on x86_64.

Please also install the testcase I posted in the other message in conjunction 
with the fix.  Thanks in advance.

-- 
Eric Botcazou

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513, 1/3] Replace vla with array - Implementation.
  2011-09-25 11:42                         ` Tom de Vries
  2011-09-26 11:03                           ` Eric Botcazou
@ 2011-09-26 11:59                           ` Richard Guenther
  2011-09-26 13:58                             ` Tom de Vries
  1 sibling, 1 reply; 37+ messages in thread
From: Richard Guenther @ 2011-09-26 11:59 UTC (permalink / raw)
  To: Tom de Vries
  Cc: Richard Guenther, Eric Botcazou, gcc-patches, Tom de Vries,
	Zdenek Dvorak

[-- Attachment #1: Type: TEXT/PLAIN, Size: 3248 bytes --]

On Sun, 25 Sep 2011, Tom de Vries wrote:

> On 09/25/2011 10:57 AM, Richard Guenther wrote:
> > On Sat, Sep 24, 2011 at 5:29 PM, Eric Botcazou <ebotcazou@adacore.com> wrote:
> >>> This is an updated version of the patch. I have 2 new patches and an
> >>> updated testcase which I will sent out individually.
> >>>
> >>> Patch set was bootstrapped and reg-tested on x86_64.
> >>>
> >>> Ok for trunk?
> >>>
> >>> Thanks,
> >>> - Tom
> >>>
> >>> 2011-07-30  Tom de Vries  <tom@codesourcery.com>
> >>>
> >>>       PR middle-end/43513
> >>>       * Makefile.in (tree-ssa-ccp.o): Add $(PARAMS_H) to rule.
> >>>       * tree-ssa-ccp.c (params.h): Include.
> >>>       (fold_builtin_alloca_for_var): New function.
> >>>       (ccp_fold_stmt): Use fold_builtin_alloca_for_var.
> >>
> >> We have detected another fallout on some Ada code: the transformation replaces
> >> a call to __builtin_alloca with &var, i.e. it introduces an aliased variable,
> >> which invalidates the points-to information of some subsequent call, fooling
> >> DSE into thinking that it can eliminate a live store.
> > 
> > Ugh, yeah.  I suppose PTA assigned a HEAP var as pointed-to object for the
> > original pointer, even if the transformed stmt
> > 
> >  orig_ptr_1 = &a;
> > 
> > has the points-to information preserved for orig_ptr_1 further propagation of
> > &a will make accesses through orig_ptr_1 have different alias properties.
> > 
> > What should work in this special case of a singleton points-to set of orig_ptr_1
> > (might want to check that) is, do
> > 
> >   SET_DECL_PT_UID (decl-of-a, DECL_UID (pointed-to orig_ptr_1));
> > 
> > The brute force approach is not acceptable (it'll wreck IPA points-to info).
> > 
> > A helper like pt_solution_singleton_p (struct pt_solution *pt, unsigned *uid)
> > whould be nice to have for this.
> > 
> > Note that we don't have points-to information computed during the first
> > CCP pass, so the above should be conditional on SSA_NAME_PTR_INFO
> > being present and not ! ->anything (but then assert that we actually do have
> > a singleton, or fail the folding).
> > 
> 
> I tried to implement the approach you describe above in attached patch.
> Currently testing on x86_64.

Looks good to me (with a changelog entry).

Thanks,
Richard.

> Thanks,
> - Tom
> 
> > Richard.
> > 
> >> The brute force approach
> >>
> >> Index: tree-ssa-ccp.c
> >> ===================================================================
> >> --- tree-ssa-ccp.c      (revision 179038)
> >> +++ tree-ssa-ccp.c      (working copy)
> >> @@ -2014,7 +2014,10 @@ do_ssa_ccp (void)
> >>   ccp_initialize ();
> >>   ssa_propagate (ccp_visit_stmt, ccp_visit_phi_node);
> >>   if (ccp_finalize ())
> >> -    return (TODO_cleanup_cfg | TODO_update_ssa | TODO_remove_unused_locals);
> >> +    return (TODO_cleanup_cfg
> >> +           | TODO_update_ssa
> >> +           | TODO_rebuild_alias
> >> +           | TODO_remove_unused_locals);
> >>   else
> >>     return 0;
> >>  }
> >>
> >> works, but we might want to be move clever.  Thoughts?
> >>
> >> --
> >> Eric Botcazou
> >>
> 
> 

-- 
Richard Guenther <rguenther@suse.de>
SUSE / SUSE Labs
SUSE LINUX Products GmbH - Nuernberg - AG Nuernberg - HRB 16746
GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer

^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH PR43513, 1/3] Replace vla with array - Implementation.
  2011-09-26 11:59                           ` Richard Guenther
@ 2011-09-26 13:58                             ` Tom de Vries
  0 siblings, 0 replies; 37+ messages in thread
From: Tom de Vries @ 2011-09-26 13:58 UTC (permalink / raw)
  To: Richard Guenther
  Cc: Richard Guenther, Eric Botcazou, gcc-patches, Tom de Vries,
	Zdenek Dvorak

[-- Attachment #1: Type: text/plain, Size: 3685 bytes --]

On 09/26/2011 12:29 PM, Richard Guenther wrote:
> On Sun, 25 Sep 2011, Tom de Vries wrote:
> 
>> On 09/25/2011 10:57 AM, Richard Guenther wrote:
>>> On Sat, Sep 24, 2011 at 5:29 PM, Eric Botcazou <ebotcazou@adacore.com> wrote:
>>>>> This is an updated version of the patch. I have 2 new patches and an
>>>>> updated testcase which I will sent out individually.
>>>>>
>>>>> Patch set was bootstrapped and reg-tested on x86_64.
>>>>>
>>>>> Ok for trunk?
>>>>>
>>>>> Thanks,
>>>>> - Tom
>>>>>
>>>>> 2011-07-30  Tom de Vries  <tom@codesourcery.com>
>>>>>
>>>>>       PR middle-end/43513
>>>>>       * Makefile.in (tree-ssa-ccp.o): Add $(PARAMS_H) to rule.
>>>>>       * tree-ssa-ccp.c (params.h): Include.
>>>>>       (fold_builtin_alloca_for_var): New function.
>>>>>       (ccp_fold_stmt): Use fold_builtin_alloca_for_var.
>>>>
>>>> We have detected another fallout on some Ada code: the transformation replaces
>>>> a call to __builtin_alloca with &var, i.e. it introduces an aliased variable,
>>>> which invalidates the points-to information of some subsequent call, fooling
>>>> DSE into thinking that it can eliminate a live store.
>>>
>>> Ugh, yeah.  I suppose PTA assigned a HEAP var as pointed-to object for the
>>> original pointer, even if the transformed stmt
>>>
>>>  orig_ptr_1 = &a;
>>>
>>> has the points-to information preserved for orig_ptr_1 further propagation of
>>> &a will make accesses through orig_ptr_1 have different alias properties.
>>>
>>> What should work in this special case of a singleton points-to set of orig_ptr_1
>>> (might want to check that) is, do
>>>
>>>   SET_DECL_PT_UID (decl-of-a, DECL_UID (pointed-to orig_ptr_1));
>>>
>>> The brute force approach is not acceptable (it'll wreck IPA points-to info).
>>>
>>> A helper like pt_solution_singleton_p (struct pt_solution *pt, unsigned *uid)
>>> whould be nice to have for this.
>>>
>>> Note that we don't have points-to information computed during the first
>>> CCP pass, so the above should be conditional on SSA_NAME_PTR_INFO
>>> being present and not ! ->anything (but then assert that we actually do have
>>> a singleton, or fail the folding).
>>>
>>
>> I tried to implement the approach you describe above in attached patch.
>> Currently testing on x86_64.
> 
> Looks good to me (with a changelog entry).
> 

Bootstrapped and regtested on x86_64, for ada and default languages, no failures
apart from the PR50433 failure, which was due to my sources being a week old.

Installed patch with ChangeLog below (reposting patch for convenience), and
installed gnat.dg/opt22* testcase.

Thanks.
- Tom

> Thanks,
> Richard.
> 
>> Thanks,
>> - Tom
>>
>>> Richard.
>>>
>>>> The brute force approach
>>>>
>>>> Index: tree-ssa-ccp.c
>>>> ===================================================================
>>>> --- tree-ssa-ccp.c      (revision 179038)
>>>> +++ tree-ssa-ccp.c      (working copy)
>>>> @@ -2014,7 +2014,10 @@ do_ssa_ccp (void)
>>>>   ccp_initialize ();
>>>>   ssa_propagate (ccp_visit_stmt, ccp_visit_phi_node);
>>>>   if (ccp_finalize ())
>>>> -    return (TODO_cleanup_cfg | TODO_update_ssa | TODO_remove_unused_locals);
>>>> +    return (TODO_cleanup_cfg
>>>> +           | TODO_update_ssa
>>>> +           | TODO_rebuild_alias
>>>> +           | TODO_remove_unused_locals);
>>>>   else
>>>>     return 0;
>>>>  }
>>>>
>>>> works, but we might want to be move clever.  Thoughts?
>>>>
>>>> --
>>>> Eric Botcazou
>>>>
>>
>>
> 

2011-09-26  Tom de Vries  <tom@codesourcery.com>

	* tree-ssa-alias.h (pt_solution_singleton_p): Declare.
	* tree-ssa-structalias.c (pt_solution_singleton_p): New function.
	* tree-ssa-ccp.c (fold_builtin_alloca_for_var): Set points-to solution
	of new var.

[-- Attachment #2: pr43513-alias.3.patch --]
[-- Type: text/x-patch, Size: 2231 bytes --]

Index: gcc/tree-ssa-ccp.c
===================================================================
--- gcc/tree-ssa-ccp.c (revision 179043)
+++ gcc/tree-ssa-ccp.c (working copy)
@@ -1729,6 +1729,17 @@ fold_builtin_alloca_for_var (gimple stmt
   array_type = build_array_type_nelts (elem_type, n_elem);
   var = create_tmp_var (array_type, NULL);
   DECL_ALIGN (var) = align;
+  {
+    struct ptr_info_def *pi = SSA_NAME_PTR_INFO (lhs);
+    if (pi != NULL && !pi->pt.anything)
+      {
+	bool singleton_p;
+	unsigned uid;
+	singleton_p = pt_solution_singleton_p (&pi->pt, &uid);
+	gcc_assert (singleton_p);
+	SET_DECL_PT_UID (var, uid);
+      }
+  }
 
   /* Fold alloca to the address of the array.  */
   return fold_convert (TREE_TYPE (lhs), build_fold_addr_expr (var));
Index: gcc/tree-ssa-alias.h
===================================================================
--- gcc/tree-ssa-alias.h (revision 179043)
+++ gcc/tree-ssa-alias.h (working copy)
@@ -126,6 +126,7 @@ extern void dump_alias_stats (FILE *);
 /* In tree-ssa-structalias.c  */
 extern unsigned int compute_may_aliases (void);
 extern bool pt_solution_empty_p (struct pt_solution *);
+extern bool pt_solution_singleton_p (struct pt_solution *, unsigned *);
 extern bool pt_solution_includes_global (struct pt_solution *);
 extern bool pt_solution_includes (struct pt_solution *, const_tree);
 extern bool pt_solutions_intersect (struct pt_solution *, struct pt_solution *);
Index: gcc/tree-ssa-structalias.c
===================================================================
--- gcc/tree-ssa-structalias.c (revision 179043)
+++ gcc/tree-ssa-structalias.c (working copy)
@@ -5978,6 +5978,21 @@ pt_solution_empty_p (struct pt_solution
   return true;
 }
 
+/* Return true if the points-to solution *PT only point to a single var, and
+   return the var uid in *UID.  */
+
+bool
+pt_solution_singleton_p (struct pt_solution *pt, unsigned *uid)
+{
+  if (pt->anything || pt->nonlocal || pt->escaped || pt->ipa_escaped
+      || pt->null || pt->vars == NULL
+      || !bitmap_single_bit_set_p (pt->vars))
+    return false;
+
+  *uid = bitmap_first_set_bit (pt->vars);
+  return true;
+}
+
 /* Return true if the points-to solution *PT includes global memory.  */
 
 bool

^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2011-09-26 12:51 UTC | newest]

Thread overview: 37+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-07-27 10:58 [PATCH, PR43513] Replace vla with array Tom de Vries
2011-07-27 10:59 ` [PATCH PR43513, 1/3] Replace vla with array - Implementation Tom de Vries
2011-07-27 11:40   ` Richard Guenther
2011-07-27 14:37     ` Tom de Vries
2011-07-27 14:52       ` Richard Guenther
2011-07-27 16:12         ` Michael Matz
2011-07-28  9:22           ` Richard Guenther
2011-07-27 17:46         ` Tom de Vries
2011-07-28  9:44           ` Richard Guenther
2011-07-29 12:09             ` Tom de Vries
2011-07-28 16:00               ` Richard Guenther
2011-07-28 18:09                 ` Tom de Vries
2011-07-28 19:38                   ` Richard Guenther
2011-07-30  9:01                   ` Tom de Vries
2011-07-30  9:23                     ` Tom de Vries
2011-07-30 14:22                       ` Richard Guenther
2011-07-30  9:27                     ` Tom de Vries
2011-07-30 13:50                       ` Richard Guenther
2011-08-07 12:00                     ` Tom de Vries
2011-08-17 12:46                       ` [PING][PATCH PR43513] Replace vla with array Tom de Vries
2011-08-24  9:31                     ` [PING][PATCH PR43513, 1/3] Replace vla with array - Implementation Tom de Vries
2011-08-30 12:29                       ` Richard Guenther
2011-08-31 15:57                     ` [PATCH " H.J. Lu
2011-09-24 19:31                     ` Eric Botcazou
2011-09-25  4:47                       ` Tom de Vries
2011-09-25 10:21                         ` Eric Botcazou
2011-09-25 10:59                       ` Richard Guenther
2011-09-25 11:42                         ` Tom de Vries
2011-09-26 11:03                           ` Eric Botcazou
2011-09-26 11:59                           ` Richard Guenther
2011-09-26 13:58                             ` Tom de Vries
2011-09-26 11:00                         ` Eric Botcazou
2011-07-27 11:00 ` [PATCH PR43513, 2/3] Replace vla with array - Test case Tom de Vries
2011-07-27 11:12   ` Tom de Vries
2011-07-30 10:39     ` Tom de Vries
2011-07-27 11:21 ` [PATCH PR43513, 3/3] Replace vla with array - Adapt mudflap testcase Tom de Vries
2011-07-27 11:51 ` [PATCH, PR43513] Replace vla with array Jakub Jelinek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).