public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta
@ 2015-11-30  9:00 Tom de Vries
  2015-11-30  9:36 ` Richard Biener
  0 siblings, 1 reply; 44+ messages in thread
From: Tom de Vries @ 2015-11-30  9:00 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jakub Jelinek, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 374 bytes --]

Hi,

this patch fixes PR46032.

It handles a call:
...
   __builtin_GOMP_parallel (fn, data, num_threads, flags)
...
as:
...
   fn (data)
...
in ipa-pta.

This improves ipa-pta alias analysis in the parallelized function fn, 
and allows vectorization in the testcase without a runtime alias test.

Bootstrapped and reg-tested on x86_64.

OK for stage3 trunk?

Thanks,
- Tom

[-- Attachment #2: 0001-Handle-BUILT_IN_GOMP_PARALLEL-in-pta.patch --]
[-- Type: text/x-patch, Size: 6855 bytes --]

Handle BUILT_IN_GOMP_PARALLEL in pta

2015-11-30  Tom de Vries  <tom@codesourcery.com>

	PR tree-optimization/46032
	* tree-ssa-structalias.c (find_func_aliases_for_builtin_call)
	(find_func_clobbers): Handle BUILT_IN_GOMP_PARALLEL.
	(ipa_pta_execute): Same.  Handle node->parallelized_function as a local
	function.

	* gcc.dg/pr46032.c: New test.

	* testsuite/libgomp.c/pr46032.c: New test.

---
 gcc/testsuite/gcc.dg/pr46032.c        | 47 ++++++++++++++++++++++
 gcc/tree-ssa-structalias.c            | 73 ++++++++++++++++++++++++++++++++++-
 libgomp/testsuite/libgomp.c/pr46032.c | 44 +++++++++++++++++++++
 3 files changed, 162 insertions(+), 2 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/pr46032.c b/gcc/testsuite/gcc.dg/pr46032.c
new file mode 100644
index 0000000..b91190e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr46032.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fopenmp -ftree-vectorize -std=c99 -fipa-pta -fdump-tree-vect-all" } */
+
+extern void abort (void);
+
+#define nEvents 1000
+
+static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
+init (unsigned *results, unsigned *pData)
+{
+  unsigned int i;
+  for (i = 0; i < nEvents; ++i)
+    pData[i] = i % 3;
+}
+
+static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
+check (unsigned *results)
+{
+  unsigned sum = 0;
+  for (int idx = 0; idx < (int)nEvents; idx++)
+    sum += results[idx];
+
+  if (sum != 1998)
+    abort ();
+}
+
+int
+main (void)
+{
+  unsigned results[nEvents];
+  unsigned pData[nEvents];
+  unsigned coeff = 2;
+
+  init (&results[0], &pData[0]);
+
+#pragma omp parallel for
+  for (int idx = 0; idx < (int)nEvents; idx++)
+    results[idx] = coeff * pData[idx];
+
+  check (&results[0]);
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "note: vectorized 1 loop" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "versioning for alias required" "vect" } } */
+
diff --git a/gcc/tree-ssa-structalias.c b/gcc/tree-ssa-structalias.c
index f24ebeb..3fe538b 100644
--- a/gcc/tree-ssa-structalias.c
+++ b/gcc/tree-ssa-structalias.c
@@ -4488,6 +4488,39 @@ find_func_aliases_for_builtin_call (struct function *fn, gcall *t)
 	    }
 	  return true;
 	}
+      case BUILT_IN_GOMP_PARALLEL:
+	{
+	  /* Handle
+	       __builtin_GOMP_parallel (fn, data, num_threads, flags).  */
+	  if (in_ipa_mode)
+	    {
+	      tree fnarg = gimple_call_arg (t, 0);
+	      gcc_assert (TREE_CODE (fnarg) == ADDR_EXPR);
+	      tree fndecl = TREE_OPERAND (fnarg, 0);
+	      varinfo_t fi = get_vi_for_tree (fndecl);
+	      tree arg = gimple_call_arg (t, 1);
+	      gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
+
+	      /* Assign the passed argument to the appropriate incoming
+		 parameter of the function.  */
+	      struct constraint_expr lhs ;
+	      lhs = get_function_part_constraint (fi, fi_parm_base + 0);
+	      auto_vec<ce_s, 2> rhsc;
+	      struct constraint_expr *rhsp;
+	      get_constraint_for_rhs (arg, &rhsc);
+	      while (rhsc.length () != 0)
+		{
+		  rhsp = &rhsc.last ();
+		  process_constraint (new_constraint (lhs, *rhsp));
+		  rhsc.pop ();
+		}
+
+	      return true;
+	    }
+	  /* Else fallthru to generic handling which will let
+	     the frame escape.  */
+	  break;
+	}
       /* printf-style functions may have hooks to set pointers to
 	 point to somewhere into the generated string.  Leave them
 	 for a later exercise...  */
@@ -5036,6 +5069,37 @@ find_func_clobbers (struct function *fn, gimple *origt)
 	  case BUILT_IN_VA_START:
 	  case BUILT_IN_VA_END:
 	    return;
+	  case BUILT_IN_GOMP_PARALLEL:
+	    {
+	      /* Handle
+		   __builtin_GOMP_parallel (fn, data, num_threads, flags).  */
+	      tree fnarg = gimple_call_arg (t, 0);
+	      gcc_assert (TREE_CODE (fnarg) == ADDR_EXPR);
+	      tree fndecl = TREE_OPERAND (fnarg, 0);
+	      varinfo_t cfi = get_vi_for_tree (fndecl);
+	      tree arg = gimple_call_arg (t, 1);
+	      gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
+
+	      /* Parameter passed by value is used.  */
+	      lhs = get_function_part_constraint (fi, fi_uses);
+	      struct constraint_expr *rhsp;
+	      get_constraint_for_address_of (arg, &rhsc);
+	      FOR_EACH_VEC_ELT (rhsc, j, rhsp)
+		process_constraint (new_constraint (lhs, *rhsp));
+	      rhsc.truncate (0);
+
+	      /* The caller clobbers what the callee does.  */
+	      lhs = get_function_part_constraint (fi, fi_clobbers);
+	      rhs = get_function_part_constraint (cfi, fi_clobbers);
+	      process_constraint (new_constraint (lhs, rhs));
+
+	      /* The caller uses what the callee does.  */
+	      lhs = get_function_part_constraint (fi, fi_uses);
+	      rhs = get_function_part_constraint (cfi, fi_uses);
+	      process_constraint (new_constraint (lhs, rhs));
+
+	      return;
+	    }
 	  /* printf-style functions may have hooks to set pointers to
 	     point to somewhere into the generated string.  Leave them
 	     for a later exercise...  */
@@ -7352,7 +7416,8 @@ ipa_pta_execute (void)
       bool nonlocal_p = (node->used_from_other_partition
 			 || node->externally_visible
 			 || node->force_output
-			 || node->address_taken);
+			 || (node->address_taken
+			     && !node->parallelized_function));
 
       vi = create_function_info_for (node->decl,
 				     alias_get_name (node->decl), false,
@@ -7504,7 +7569,11 @@ ipa_pta_execute (void)
 		continue;
 
 	      /* Handle direct calls to functions with body.  */
-	      decl = gimple_call_fndecl (stmt);
+	      if (gimple_call_builtin_p (stmt, BUILT_IN_GOMP_PARALLEL))
+		decl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
+	      else
+		decl = gimple_call_fndecl (stmt);
+
 	      if (decl
 		  && (fi = lookup_vi_for_tree (decl))
 		  && fi->is_fn_info)
diff --git a/libgomp/testsuite/libgomp.c/pr46032.c b/libgomp/testsuite/libgomp.c/pr46032.c
new file mode 100644
index 0000000..2178aa7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/pr46032.c
@@ -0,0 +1,44 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-vectorize -std=c99 -fipa-pta" } */
+
+
+extern void abort (void);
+
+#define nEvents 1000
+
+static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
+init (unsigned *results, unsigned *pData)
+{
+  unsigned int i;
+  for (i = 0; i < nEvents; ++i)
+    pData[i] = i % 3;
+}
+
+static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
+check (unsigned *results)
+{
+  unsigned sum = 0;
+  for (int idx = 0; idx < (int)nEvents; idx++)
+    sum += results[idx];
+
+  if (sum != 1998)
+    abort ();
+}
+
+int
+main (void)
+{
+  unsigned results[nEvents];
+  unsigned pData[nEvents];
+  unsigned coeff = 2;
+
+  init (&results[0], &pData[0]);
+
+#pragma omp parallel for
+  for (int idx = 0; idx < (int)nEvents; idx++)
+    results[idx] = coeff * pData[idx];
+
+  check (&results[0]);
+
+  return 0;
+}

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta
  2015-11-30  9:00 [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta Tom de Vries
@ 2015-11-30  9:36 ` Richard Biener
  2015-11-30 12:20   ` Tom de Vries
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Biener @ 2015-11-30  9:36 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Jakub Jelinek, gcc-patches

On Mon, 30 Nov 2015, Tom de Vries wrote:

> Hi,
> 
> this patch fixes PR46032.
> 
> It handles a call:
> ...
>   __builtin_GOMP_parallel (fn, data, num_threads, flags)
> ...
> as:
> ...
>   fn (data)
> ...
> in ipa-pta.
> 
> This improves ipa-pta alias analysis in the parallelized function fn, and
> allows vectorization in the testcase without a runtime alias test.
> 
> Bootstrapped and reg-tested on x86_64.
> 
> OK for stage3 trunk?

+             /* Assign the passed argument to the appropriate incoming
+                parameter of the function.  */
+             struct constraint_expr lhs ;
+             lhs = get_function_part_constraint (fi, fi_parm_base + 0);
+             auto_vec<ce_s, 2> rhsc;
+             struct constraint_expr *rhsp;
+             get_constraint_for_rhs (arg, &rhsc);
+             while (rhsc.length () != 0)
+               {
+                 rhsp = &rhsc.last ();
+                 process_constraint (new_constraint (lhs, *rhsp));
+                 rhsc.pop ();
+               }

please use style used elsewhere with

             FOR_EACH_VEC_ELT (rhsc, j, rhsp)
               process_constraint (new_constraint (lhs, *rhsp));
             rhsc.truncate (0);

+             /* Parameter passed by value is used.  */
+             lhs = get_function_part_constraint (fi, fi_uses);
+             struct constraint_expr *rhsp;
+             get_constraint_for_address_of (arg, &rhsc);

This isn't correct - you want to use get_constraint_for (arg, &rhsc).
After all rhs is already an ADDR_EXPR.

+             FOR_EACH_VEC_ELT (rhsc, j, rhsp)
+               process_constraint (new_constraint (lhs, *rhsp));
+             rhsc.truncate (0);
+
+             /* The caller clobbers what the callee does.  */
+             lhs = get_function_part_constraint (fi, fi_clobbers);
+             rhs = get_function_part_constraint (cfi, fi_clobbers);
+             process_constraint (new_constraint (lhs, rhs));
+
+             /* The caller uses what the callee does.  */
+             lhs = get_function_part_constraint (fi, fi_uses);
+             rhs = get_function_part_constraint (cfi, fi_uses);
+             process_constraint (new_constraint (lhs, rhs));

I don't see why you need those.  The solver should compute these
in even better precision (context sensitive on the call side).

The same is true for the function parameter.  That is, the only
needed part of the patch should be that making sure we see
the "direct" call and assign parameters correctly.

Richard.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta
  2015-11-30  9:36 ` Richard Biener
@ 2015-11-30 12:20   ` Tom de Vries
  2015-11-30 13:32     ` Richard Biener
  2015-12-09 10:01     ` [PATCH, PR68716] Fix GOMP/GOACC_parallel handling in find_func_clobbers Tom de Vries
  0 siblings, 2 replies; 44+ messages in thread
From: Tom de Vries @ 2015-11-30 12:20 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jakub Jelinek, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 3067 bytes --]

On 30/11/15 10:16, Richard Biener wrote:
> On Mon, 30 Nov 2015, Tom de Vries wrote:
>
>> Hi,
>>
>> this patch fixes PR46032.
>>
>> It handles a call:
>> ...
>>    __builtin_GOMP_parallel (fn, data, num_threads, flags)
>> ...
>> as:
>> ...
>>    fn (data)
>> ...
>> in ipa-pta.
>>
>> This improves ipa-pta alias analysis in the parallelized function fn, and
>> allows vectorization in the testcase without a runtime alias test.
>>
>> Bootstrapped and reg-tested on x86_64.
>>
>> OK for stage3 trunk?
>
> +             /* Assign the passed argument to the appropriate incoming
> +                parameter of the function.  */
> +             struct constraint_expr lhs ;
> +             lhs = get_function_part_constraint (fi, fi_parm_base + 0);
> +             auto_vec<ce_s, 2> rhsc;
> +             struct constraint_expr *rhsp;
> +             get_constraint_for_rhs (arg, &rhsc);
> +             while (rhsc.length () != 0)
> +               {
> +                 rhsp = &rhsc.last ();
> +                 process_constraint (new_constraint (lhs, *rhsp));
> +                 rhsc.pop ();
> +               }
>
> please use style used elsewhere with
>
>               FOR_EACH_VEC_ELT (rhsc, j, rhsp)
>                 process_constraint (new_constraint (lhs, *rhsp));
>               rhsc.truncate (0);
>

That code was copied from find_func_aliases_for_call.
I've factored out the bit that I copied as 
find_func_aliases_for_call_arg, and fixed the style there (and dropped 
'rhsc.truncate (0)' since AFAIU it's redundant at the end of a function).

> +             /* Parameter passed by value is used.  */
> +             lhs = get_function_part_constraint (fi, fi_uses);
> +             struct constraint_expr *rhsp;
> +             get_constraint_for_address_of (arg, &rhsc);
>
> This isn't correct - you want to use get_constraint_for (arg, &rhsc).
> After all rhs is already an ADDR_EXPR.
>

Can we add an assert somewhere to detect this incorrect usage?

> +             FOR_EACH_VEC_ELT (rhsc, j, rhsp)
> +               process_constraint (new_constraint (lhs, *rhsp));
> +             rhsc.truncate (0);
> +
> +             /* The caller clobbers what the callee does.  */
> +             lhs = get_function_part_constraint (fi, fi_clobbers);
> +             rhs = get_function_part_constraint (cfi, fi_clobbers);
> +             process_constraint (new_constraint (lhs, rhs));
> +
> +             /* The caller uses what the callee does.  */
> +             lhs = get_function_part_constraint (fi, fi_uses);
> +             rhs = get_function_part_constraint (cfi, fi_uses);
> +             process_constraint (new_constraint (lhs, rhs));
>
> I don't see why you need those.  The solver should compute these
> in even better precision (context sensitive on the call side).
>
> The same is true for the function parameter.  That is, the only
> needed part of the patch should be that making sure we see
> the "direct" call and assign parameters correctly.
>

Dropped this bit.

OK for stage3 trunk if bootstrap and reg-test succeeds?

Thanks,
- Tom



[-- Attachment #2: 0001-Handle-BUILT_IN_GOMP_PARALLEL-in-ipa-pta.patch --]
[-- Type: text/x-patch, Size: 6743 bytes --]

Handle BUILT_IN_GOMP_PARALLEL in ipa-pta

2015-11-30  Tom de Vries  <tom@codesourcery.com>

	PR tree-optimization/46032
	* tree-ssa-structalias.c (find_func_aliases_for_call_arg): New function,
	factored out of ...
	(find_func_aliases_for_call): ... here.
	(find_func_aliases_for_builtin_call, find_func_clobbers): Handle
	BUILT_IN_GOMP_PARALLEL.
	(ipa_pta_execute): Same.  Handle node->parallelized_function as a local
	function.

	* gcc.dg/pr46032.c: New test.

	* testsuite/libgomp.c/pr46032.c: New test.

---
 gcc/testsuite/gcc.dg/pr46032.c        | 47 +++++++++++++++++++++++++++
 gcc/tree-ssa-structalias.c            | 60 +++++++++++++++++++++++++++--------
 libgomp/testsuite/libgomp.c/pr46032.c | 44 +++++++++++++++++++++++++
 3 files changed, 138 insertions(+), 13 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/pr46032.c b/gcc/testsuite/gcc.dg/pr46032.c
new file mode 100644
index 0000000..b91190e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr46032.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fopenmp -ftree-vectorize -std=c99 -fipa-pta -fdump-tree-vect-all" } */
+
+extern void abort (void);
+
+#define nEvents 1000
+
+static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
+init (unsigned *results, unsigned *pData)
+{
+  unsigned int i;
+  for (i = 0; i < nEvents; ++i)
+    pData[i] = i % 3;
+}
+
+static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
+check (unsigned *results)
+{
+  unsigned sum = 0;
+  for (int idx = 0; idx < (int)nEvents; idx++)
+    sum += results[idx];
+
+  if (sum != 1998)
+    abort ();
+}
+
+int
+main (void)
+{
+  unsigned results[nEvents];
+  unsigned pData[nEvents];
+  unsigned coeff = 2;
+
+  init (&results[0], &pData[0]);
+
+#pragma omp parallel for
+  for (int idx = 0; idx < (int)nEvents; idx++)
+    results[idx] = coeff * pData[idx];
+
+  check (&results[0]);
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "note: vectorized 1 loop" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "versioning for alias required" "vect" } } */
+
diff --git a/gcc/tree-ssa-structalias.c b/gcc/tree-ssa-structalias.c
index f24ebeb..23f79b4 100644
--- a/gcc/tree-ssa-structalias.c
+++ b/gcc/tree-ssa-structalias.c
@@ -4139,6 +4139,24 @@ get_fi_for_callee (gcall *call)
   return get_vi_for_tree (fn);
 }
 
+/* Create constraints for assigning call argument ARG to the incoming parameter
+   INDEX of function FI.  */
+
+static void
+find_func_aliases_for_call_arg (varinfo_t fi, unsigned index, tree arg)
+{
+  struct constraint_expr lhs;
+  lhs = get_function_part_constraint (fi, fi_parm_base + index);
+
+  auto_vec<ce_s, 2> rhsc;
+  get_constraint_for_rhs (arg, &rhsc);
+
+  unsigned j;
+  struct constraint_expr *rhsp;
+  FOR_EACH_VEC_ELT (rhsc, j, rhsp)
+    process_constraint (new_constraint (lhs, *rhsp));
+}
+
 /* Create constraints for the builtin call T.  Return true if the call
    was handled, otherwise false.  */
 
@@ -4488,6 +4506,25 @@ find_func_aliases_for_builtin_call (struct function *fn, gcall *t)
 	    }
 	  return true;
 	}
+      case BUILT_IN_GOMP_PARALLEL:
+	{
+	  /* Handle __builtin_GOMP_parallel (fn, data, num_threads, flags) as
+	     fn (data).  */
+	  if (in_ipa_mode)
+	    {
+	      tree fnarg = gimple_call_arg (t, 0);
+	      gcc_assert (TREE_CODE (fnarg) == ADDR_EXPR);
+	      tree fndecl = TREE_OPERAND (fnarg, 0);
+	      tree arg = gimple_call_arg (t, 1);
+	      gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
+
+	      varinfo_t fi = get_vi_for_tree (fndecl);
+	      find_func_aliases_for_call_arg (fi, 0, arg);
+	      return true;
+	    }
+	  /* Else fallthru to generic call handling.  */
+	  break;
+	}
       /* printf-style functions may have hooks to set pointers to
 	 point to somewhere into the generated string.  Leave them
 	 for a later exercise...  */
@@ -4546,18 +4583,8 @@ find_func_aliases_for_call (struct function *fn, gcall *t)
 	 parameters of the function.  */
       for (j = 0; j < gimple_call_num_args (t); j++)
 	{
-	  struct constraint_expr lhs ;
-	  struct constraint_expr *rhsp;
 	  tree arg = gimple_call_arg (t, j);
-
-	  get_constraint_for_rhs (arg, &rhsc);
-	  lhs = get_function_part_constraint (fi, fi_parm_base + j);
-	  while (rhsc.length () != 0)
-	    {
-	      rhsp = &rhsc.last ();
-	      process_constraint (new_constraint (lhs, *rhsp));
-	      rhsc.pop ();
-	    }
+	  find_func_aliases_for_call_arg (fi, j, arg);
 	}
 
       /* If we are returning a value, assign it to the result.  */
@@ -5036,6 +5063,8 @@ find_func_clobbers (struct function *fn, gimple *origt)
 	  case BUILT_IN_VA_START:
 	  case BUILT_IN_VA_END:
 	    return;
+	  case BUILT_IN_GOMP_PARALLEL:
+	    return;
 	  /* printf-style functions may have hooks to set pointers to
 	     point to somewhere into the generated string.  Leave them
 	     for a later exercise...  */
@@ -7352,7 +7381,8 @@ ipa_pta_execute (void)
       bool nonlocal_p = (node->used_from_other_partition
 			 || node->externally_visible
 			 || node->force_output
-			 || node->address_taken);
+			 || (node->address_taken
+			     && !node->parallelized_function));
 
       vi = create_function_info_for (node->decl,
 				     alias_get_name (node->decl), false,
@@ -7504,7 +7534,11 @@ ipa_pta_execute (void)
 		continue;
 
 	      /* Handle direct calls to functions with body.  */
-	      decl = gimple_call_fndecl (stmt);
+	      if (gimple_call_builtin_p (stmt, BUILT_IN_GOMP_PARALLEL))
+		decl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
+	      else
+		decl = gimple_call_fndecl (stmt);
+
 	      if (decl
 		  && (fi = lookup_vi_for_tree (decl))
 		  && fi->is_fn_info)
diff --git a/libgomp/testsuite/libgomp.c/pr46032.c b/libgomp/testsuite/libgomp.c/pr46032.c
new file mode 100644
index 0000000..2178aa7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/pr46032.c
@@ -0,0 +1,44 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-vectorize -std=c99 -fipa-pta" } */
+
+
+extern void abort (void);
+
+#define nEvents 1000
+
+static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
+init (unsigned *results, unsigned *pData)
+{
+  unsigned int i;
+  for (i = 0; i < nEvents; ++i)
+    pData[i] = i % 3;
+}
+
+static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
+check (unsigned *results)
+{
+  unsigned sum = 0;
+  for (int idx = 0; idx < (int)nEvents; idx++)
+    sum += results[idx];
+
+  if (sum != 1998)
+    abort ();
+}
+
+int
+main (void)
+{
+  unsigned results[nEvents];
+  unsigned pData[nEvents];
+  unsigned coeff = 2;
+
+  init (&results[0], &pData[0]);
+
+#pragma omp parallel for
+  for (int idx = 0; idx < (int)nEvents; idx++)
+    results[idx] = coeff * pData[idx];
+
+  check (&results[0]);
+
+  return 0;
+}

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta
  2015-11-30 12:20   ` Tom de Vries
@ 2015-11-30 13:32     ` Richard Biener
  2015-11-30 13:35       ` Jakub Jelinek
  2015-11-30 16:44       ` Tom de Vries
  2015-12-09 10:01     ` [PATCH, PR68716] Fix GOMP/GOACC_parallel handling in find_func_clobbers Tom de Vries
  1 sibling, 2 replies; 44+ messages in thread
From: Richard Biener @ 2015-11-30 13:32 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Jakub Jelinek, gcc-patches

On Mon, 30 Nov 2015, Tom de Vries wrote:

> On 30/11/15 10:16, Richard Biener wrote:
> > On Mon, 30 Nov 2015, Tom de Vries wrote:
> > 
> > > Hi,
> > > 
> > > this patch fixes PR46032.
> > > 
> > > It handles a call:
> > > ...
> > >    __builtin_GOMP_parallel (fn, data, num_threads, flags)
> > > ...
> > > as:
> > > ...
> > >    fn (data)
> > > ...
> > > in ipa-pta.
> > > 
> > > This improves ipa-pta alias analysis in the parallelized function fn, and
> > > allows vectorization in the testcase without a runtime alias test.
> > > 
> > > Bootstrapped and reg-tested on x86_64.
> > > 
> > > OK for stage3 trunk?
> > 
> > +             /* Assign the passed argument to the appropriate incoming
> > +                parameter of the function.  */
> > +             struct constraint_expr lhs ;
> > +             lhs = get_function_part_constraint (fi, fi_parm_base + 0);
> > +             auto_vec<ce_s, 2> rhsc;
> > +             struct constraint_expr *rhsp;
> > +             get_constraint_for_rhs (arg, &rhsc);
> > +             while (rhsc.length () != 0)
> > +               {
> > +                 rhsp = &rhsc.last ();
> > +                 process_constraint (new_constraint (lhs, *rhsp));
> > +                 rhsc.pop ();
> > +               }
> > 
> > please use style used elsewhere with
> > 
> >               FOR_EACH_VEC_ELT (rhsc, j, rhsp)
> >                 process_constraint (new_constraint (lhs, *rhsp));
> >               rhsc.truncate (0);
> > 
> 
> That code was copied from find_func_aliases_for_call.
> I've factored out the bit that I copied as find_func_aliases_for_call_arg, and
> fixed the style there (and dropped 'rhsc.truncate (0)' since AFAIU it's
> redundant at the end of a function).
> 
> > +             /* Parameter passed by value is used.  */
> > +             lhs = get_function_part_constraint (fi, fi_uses);
> > +             struct constraint_expr *rhsp;
> > +             get_constraint_for_address_of (arg, &rhsc);
> > 
> > This isn't correct - you want to use get_constraint_for (arg, &rhsc).
> > After all rhs is already an ADDR_EXPR.
> > 
> 
> Can we add an assert somewhere to detect this incorrect usage?
> 
> > +             FOR_EACH_VEC_ELT (rhsc, j, rhsp)
> > +               process_constraint (new_constraint (lhs, *rhsp));
> > +             rhsc.truncate (0);
> > +
> > +             /* The caller clobbers what the callee does.  */
> > +             lhs = get_function_part_constraint (fi, fi_clobbers);
> > +             rhs = get_function_part_constraint (cfi, fi_clobbers);
> > +             process_constraint (new_constraint (lhs, rhs));
> > +
> > +             /* The caller uses what the callee does.  */
> > +             lhs = get_function_part_constraint (fi, fi_uses);
> > +             rhs = get_function_part_constraint (cfi, fi_uses);
> > +             process_constraint (new_constraint (lhs, rhs));
> > 
> > I don't see why you need those.  The solver should compute these
> > in even better precision (context sensitive on the call side).
> > 
> > The same is true for the function parameter.  That is, the only
> > needed part of the patch should be that making sure we see
> > the "direct" call and assign parameters correctly.
> > 
> 
> Dropped this bit.
> 
> OK for stage3 trunk if bootstrap and reg-test succeeds?

-                        || node->address_taken);
+                        || (node->address_taken
+                            && !node->parallelized_function));

please add a comment here on why this is safe.

Ok with this change.

Thanks,
Richard.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta
  2015-11-30 13:32     ` Richard Biener
@ 2015-11-30 13:35       ` Jakub Jelinek
  2015-12-03 11:49         ` Tom de Vries
  2015-11-30 16:44       ` Tom de Vries
  1 sibling, 1 reply; 44+ messages in thread
From: Jakub Jelinek @ 2015-11-30 13:35 UTC (permalink / raw)
  To: Richard Biener; +Cc: Tom de Vries, gcc-patches

On Mon, Nov 30, 2015 at 02:24:18PM +0100, Richard Biener wrote:
> > OK for stage3 trunk if bootstrap and reg-test succeeds?
> 
> -                        || node->address_taken);
> +                        || (node->address_taken
> +                            && !node->parallelized_function));
> 
> please add a comment here on why this is safe.
> 
> Ok with this change.

BTW, __builting_GOMP_task supposedly can be treated similarly
if the third argument is NULL (if 3rd arg is non-NULL, then
the caller passes a different structure from what the callee receives,
but perhaps it could be emulated as pretending that cpyfn is called first
with address of a temporary var and the data argument and then fn
is called with the address of the temporary var).

	Jakub

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta
  2015-11-30 13:32     ` Richard Biener
  2015-11-30 13:35       ` Jakub Jelinek
@ 2015-11-30 16:44       ` Tom de Vries
  2015-11-30 16:52         ` Jakub Jelinek
  2015-12-01 14:27         ` [PATCH] Handle BUILT_IN_GOACC_PARALLEL " Tom de Vries
  1 sibling, 2 replies; 44+ messages in thread
From: Tom de Vries @ 2015-11-30 16:44 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jakub Jelinek, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 3629 bytes --]

On 30/11/15 14:24, Richard Biener wrote:
> On Mon, 30 Nov 2015, Tom de Vries wrote:
>
>> On 30/11/15 10:16, Richard Biener wrote:
>>> On Mon, 30 Nov 2015, Tom de Vries wrote:
>>>
>>>> Hi,
>>>>
>>>> this patch fixes PR46032.
>>>>
>>>> It handles a call:
>>>> ...
>>>>     __builtin_GOMP_parallel (fn, data, num_threads, flags)
>>>> ...
>>>> as:
>>>> ...
>>>>     fn (data)
>>>> ...
>>>> in ipa-pta.
>>>>
>>>> This improves ipa-pta alias analysis in the parallelized function fn, and
>>>> allows vectorization in the testcase without a runtime alias test.
>>>>
>>>> Bootstrapped and reg-tested on x86_64.
>>>>
>>>> OK for stage3 trunk?
>>>
>>> +             /* Assign the passed argument to the appropriate incoming
>>> +                parameter of the function.  */
>>> +             struct constraint_expr lhs ;
>>> +             lhs = get_function_part_constraint (fi, fi_parm_base + 0);
>>> +             auto_vec<ce_s, 2> rhsc;
>>> +             struct constraint_expr *rhsp;
>>> +             get_constraint_for_rhs (arg, &rhsc);
>>> +             while (rhsc.length () != 0)
>>> +               {
>>> +                 rhsp = &rhsc.last ();
>>> +                 process_constraint (new_constraint (lhs, *rhsp));
>>> +                 rhsc.pop ();
>>> +               }
>>>
>>> please use style used elsewhere with
>>>
>>>                FOR_EACH_VEC_ELT (rhsc, j, rhsp)
>>>                  process_constraint (new_constraint (lhs, *rhsp));
>>>                rhsc.truncate (0);
>>>
>>
>> That code was copied from find_func_aliases_for_call.
>> I've factored out the bit that I copied as find_func_aliases_for_call_arg, and
>> fixed the style there (and dropped 'rhsc.truncate (0)' since AFAIU it's
>> redundant at the end of a function).
>>
>>> +             /* Parameter passed by value is used.  */
>>> +             lhs = get_function_part_constraint (fi, fi_uses);
>>> +             struct constraint_expr *rhsp;
>>> +             get_constraint_for_address_of (arg, &rhsc);
>>>
>>> This isn't correct - you want to use get_constraint_for (arg, &rhsc).
>>> After all rhs is already an ADDR_EXPR.
>>>
>>
>> Can we add an assert somewhere to detect this incorrect usage?
>>
>>> +             FOR_EACH_VEC_ELT (rhsc, j, rhsp)
>>> +               process_constraint (new_constraint (lhs, *rhsp));
>>> +             rhsc.truncate (0);
>>> +
>>> +             /* The caller clobbers what the callee does.  */
>>> +             lhs = get_function_part_constraint (fi, fi_clobbers);
>>> +             rhs = get_function_part_constraint (cfi, fi_clobbers);
>>> +             process_constraint (new_constraint (lhs, rhs));
>>> +
>>> +             /* The caller uses what the callee does.  */
>>> +             lhs = get_function_part_constraint (fi, fi_uses);
>>> +             rhs = get_function_part_constraint (cfi, fi_uses);
>>> +             process_constraint (new_constraint (lhs, rhs));
>>>
>>> I don't see why you need those.  The solver should compute these
>>> in even better precision (context sensitive on the call side).
>>>
>>> The same is true for the function parameter.  That is, the only
>>> needed part of the patch should be that making sure we see
>>> the "direct" call and assign parameters correctly.
>>>
>>
>> Dropped this bit.
>>
>> OK for stage3 trunk if bootstrap and reg-test succeeds?
>
> -                        || node->address_taken);
> +                        || (node->address_taken
> +                            && !node->parallelized_function));
>
> please add a comment here on why this is safe.
>
> Ok with this change.

Updated with comment, committed as attached.

Thanks,
- Tom



[-- Attachment #2: 0001-Handle-BUILT_IN_GOMP_PARALLEL-in-ipa-pta.patch --]
[-- Type: text/x-patch, Size: 7669 bytes --]

Handle BUILT_IN_GOMP_PARALLEL in ipa-pta

2015-11-30  Tom de Vries  <tom@codesourcery.com>

	PR tree-optimization/46032
	* tree-ssa-structalias.c (find_func_aliases_for_call_arg): New function,
	factored out of ...
	(find_func_aliases_for_call): ... here.
	(find_func_aliases_for_builtin_call, find_func_clobbers): Handle
	BUILT_IN_GOMP_PARALLEL.
	(ipa_pta_execute): Same.  Handle node->parallelized_function as a local
	function.

	* gcc.dg/pr46032.c: New test.

	* testsuite/libgomp.c/pr46032.c: New test.

---
 gcc/testsuite/gcc.dg/pr46032.c        | 47 +++++++++++++++++++++++
 gcc/tree-ssa-structalias.c            | 71 ++++++++++++++++++++++++++++-------
 libgomp/testsuite/libgomp.c/pr46032.c | 44 ++++++++++++++++++++++
 3 files changed, 149 insertions(+), 13 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/pr46032.c b/gcc/testsuite/gcc.dg/pr46032.c
new file mode 100644
index 0000000..b91190e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr46032.c
@@ -0,0 +1,47 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fopenmp -ftree-vectorize -std=c99 -fipa-pta -fdump-tree-vect-all" } */
+
+extern void abort (void);
+
+#define nEvents 1000
+
+static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
+init (unsigned *results, unsigned *pData)
+{
+  unsigned int i;
+  for (i = 0; i < nEvents; ++i)
+    pData[i] = i % 3;
+}
+
+static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
+check (unsigned *results)
+{
+  unsigned sum = 0;
+  for (int idx = 0; idx < (int)nEvents; idx++)
+    sum += results[idx];
+
+  if (sum != 1998)
+    abort ();
+}
+
+int
+main (void)
+{
+  unsigned results[nEvents];
+  unsigned pData[nEvents];
+  unsigned coeff = 2;
+
+  init (&results[0], &pData[0]);
+
+#pragma omp parallel for
+  for (int idx = 0; idx < (int)nEvents; idx++)
+    results[idx] = coeff * pData[idx];
+
+  check (&results[0]);
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "note: vectorized 1 loop" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "versioning for alias required" "vect" } } */
+
diff --git a/gcc/tree-ssa-structalias.c b/gcc/tree-ssa-structalias.c
index f24ebeb..7f4a8ad 100644
--- a/gcc/tree-ssa-structalias.c
+++ b/gcc/tree-ssa-structalias.c
@@ -4139,6 +4139,24 @@ get_fi_for_callee (gcall *call)
   return get_vi_for_tree (fn);
 }
 
+/* Create constraints for assigning call argument ARG to the incoming parameter
+   INDEX of function FI.  */
+
+static void
+find_func_aliases_for_call_arg (varinfo_t fi, unsigned index, tree arg)
+{
+  struct constraint_expr lhs;
+  lhs = get_function_part_constraint (fi, fi_parm_base + index);
+
+  auto_vec<ce_s, 2> rhsc;
+  get_constraint_for_rhs (arg, &rhsc);
+
+  unsigned j;
+  struct constraint_expr *rhsp;
+  FOR_EACH_VEC_ELT (rhsc, j, rhsp)
+    process_constraint (new_constraint (lhs, *rhsp));
+}
+
 /* Create constraints for the builtin call T.  Return true if the call
    was handled, otherwise false.  */
 
@@ -4488,6 +4506,25 @@ find_func_aliases_for_builtin_call (struct function *fn, gcall *t)
 	    }
 	  return true;
 	}
+      case BUILT_IN_GOMP_PARALLEL:
+	{
+	  /* Handle __builtin_GOMP_parallel (fn, data, num_threads, flags) as
+	     fn (data).  */
+	  if (in_ipa_mode)
+	    {
+	      tree fnarg = gimple_call_arg (t, 0);
+	      gcc_assert (TREE_CODE (fnarg) == ADDR_EXPR);
+	      tree fndecl = TREE_OPERAND (fnarg, 0);
+	      tree arg = gimple_call_arg (t, 1);
+	      gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
+
+	      varinfo_t fi = get_vi_for_tree (fndecl);
+	      find_func_aliases_for_call_arg (fi, 0, arg);
+	      return true;
+	    }
+	  /* Else fallthru to generic call handling.  */
+	  break;
+	}
       /* printf-style functions may have hooks to set pointers to
 	 point to somewhere into the generated string.  Leave them
 	 for a later exercise...  */
@@ -4546,18 +4583,8 @@ find_func_aliases_for_call (struct function *fn, gcall *t)
 	 parameters of the function.  */
       for (j = 0; j < gimple_call_num_args (t); j++)
 	{
-	  struct constraint_expr lhs ;
-	  struct constraint_expr *rhsp;
 	  tree arg = gimple_call_arg (t, j);
-
-	  get_constraint_for_rhs (arg, &rhsc);
-	  lhs = get_function_part_constraint (fi, fi_parm_base + j);
-	  while (rhsc.length () != 0)
-	    {
-	      rhsp = &rhsc.last ();
-	      process_constraint (new_constraint (lhs, *rhsp));
-	      rhsc.pop ();
-	    }
+	  find_func_aliases_for_call_arg (fi, j, arg);
 	}
 
       /* If we are returning a value, assign it to the result.  */
@@ -5036,6 +5063,8 @@ find_func_clobbers (struct function *fn, gimple *origt)
 	  case BUILT_IN_VA_START:
 	  case BUILT_IN_VA_END:
 	    return;
+	  case BUILT_IN_GOMP_PARALLEL:
+	    return;
 	  /* printf-style functions may have hooks to set pointers to
 	     point to somewhere into the generated string.  Leave them
 	     for a later exercise...  */
@@ -7345,6 +7374,18 @@ ipa_pta_execute (void)
 
       gcc_assert (!node->clone_of);
 
+      /* When parallelizing a code region, we split the region off into a
+	 separate function, to be run by several threads in parallel.  So for a
+	 function foo, we split off a region into a function
+	 foo._0 (void *foodata), and replace the region with some variant of a
+	 function call run_on_threads (&foo._0, data).  The '&foo._0' sets the
+	 address_taken bit for function foo._0, which would make it non-local.
+	 But for the purpose of ipa-pta, we can regard the run_on_threads call
+	 as a local call foo._0 (data),  so we ignore address_taken on nodes
+	 with parallelized_function set.  */
+      bool node_address_taken = (node->address_taken
+				 && !node->parallelized_function);
+
       /* For externally visible or attribute used annotated functions use
 	 local constraints for their arguments.
 	 For local functions we see all callers and thus do not need initial
@@ -7352,7 +7393,7 @@ ipa_pta_execute (void)
       bool nonlocal_p = (node->used_from_other_partition
 			 || node->externally_visible
 			 || node->force_output
-			 || node->address_taken);
+			 || node_address_taken);
 
       vi = create_function_info_for (node->decl,
 				     alias_get_name (node->decl), false,
@@ -7504,7 +7545,11 @@ ipa_pta_execute (void)
 		continue;
 
 	      /* Handle direct calls to functions with body.  */
-	      decl = gimple_call_fndecl (stmt);
+	      if (gimple_call_builtin_p (stmt, BUILT_IN_GOMP_PARALLEL))
+		decl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
+	      else
+		decl = gimple_call_fndecl (stmt);
+
 	      if (decl
 		  && (fi = lookup_vi_for_tree (decl))
 		  && fi->is_fn_info)
diff --git a/libgomp/testsuite/libgomp.c/pr46032.c b/libgomp/testsuite/libgomp.c/pr46032.c
new file mode 100644
index 0000000..2178aa7
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/pr46032.c
@@ -0,0 +1,44 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-vectorize -std=c99 -fipa-pta" } */
+
+
+extern void abort (void);
+
+#define nEvents 1000
+
+static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
+init (unsigned *results, unsigned *pData)
+{
+  unsigned int i;
+  for (i = 0; i < nEvents; ++i)
+    pData[i] = i % 3;
+}
+
+static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
+check (unsigned *results)
+{
+  unsigned sum = 0;
+  for (int idx = 0; idx < (int)nEvents; idx++)
+    sum += results[idx];
+
+  if (sum != 1998)
+    abort ();
+}
+
+int
+main (void)
+{
+  unsigned results[nEvents];
+  unsigned pData[nEvents];
+  unsigned coeff = 2;
+
+  init (&results[0], &pData[0]);
+
+#pragma omp parallel for
+  for (int idx = 0; idx < (int)nEvents; idx++)
+    results[idx] = coeff * pData[idx];
+
+  check (&results[0]);
+
+  return 0;
+}

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta
  2015-11-30 16:44       ` Tom de Vries
@ 2015-11-30 16:52         ` Jakub Jelinek
  2015-11-30 18:00           ` Tom de Vries
  2015-12-01 14:27         ` [PATCH] Handle BUILT_IN_GOACC_PARALLEL " Tom de Vries
  1 sibling, 1 reply; 44+ messages in thread
From: Jakub Jelinek @ 2015-11-30 16:52 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Richard Biener, gcc-patches

On Mon, Nov 30, 2015 at 05:36:25PM +0100, Tom de Vries wrote:
> +int
> +main (void)
> +{
> +  unsigned results[nEvents];
> +  unsigned pData[nEvents];
> +  unsigned coeff = 2;
> +
> +  init (&results[0], &pData[0]);
> +
> +#pragma omp parallel for
> +  for (int idx = 0; idx < (int)nEvents; idx++)
> +    results[idx] = coeff * pData[idx];

Could you please add another testcase, where you have say pData
and some other pointer that init sets to alias with pData, and verify
that such loop (would need to be say normal loop inside #pragma omp single
or master) is not vectorized?

	Jakub

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta
  2015-11-30 16:52         ` Jakub Jelinek
@ 2015-11-30 18:00           ` Tom de Vries
  2015-12-01 12:27             ` Christophe Lyon
  0 siblings, 1 reply; 44+ messages in thread
From: Tom de Vries @ 2015-11-30 18:00 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 866 bytes --]

On 30/11/15 17:48, Jakub Jelinek wrote:
> On Mon, Nov 30, 2015 at 05:36:25PM +0100, Tom de Vries wrote:
>> +int
>> +main (void)
>> +{
>> +  unsigned results[nEvents];
>> +  unsigned pData[nEvents];
>> +  unsigned coeff = 2;
>> +
>> +  init (&results[0], &pData[0]);
>> +
>> +#pragma omp parallel for
>> +  for (int idx = 0; idx < (int)nEvents; idx++)
>> +    results[idx] = coeff * pData[idx];
>
> Could you please add another testcase, where you have say pData
> and some other pointer that init sets to alias with pData, and verify
> that such loop (would need to be say normal loop inside #pragma omp single
> or master) is not vectorized?

I've:
- added a simpler (not vectorizer-based) version of the testcase as
   pr46032-2.c, and
- copied pr46032-2.c to pr46032-3.c and modified it such that two
   pointers are aliasing

Committed to trunk.

Thanks,
- Tom


[-- Attachment #2: 0001-Add-gcc.dg-pr46032-2-3-.c-test-cases.patch --]
[-- Type: text/x-patch, Size: 2093 bytes --]

Add gcc.dg/pr46032-{2,3}.c test-cases

2015-11-30  Tom de Vries  <tom@codesourcery.com>

	* gcc.dg/pr46032-2.c: New test.
	* gcc.dg/pr46032-3.c: New test.

---
 gcc/testsuite/gcc.dg/pr46032-2.c | 29 +++++++++++++++++++++++++++++
 gcc/testsuite/gcc.dg/pr46032-3.c | 28 ++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/gcc/testsuite/gcc.dg/pr46032-2.c b/gcc/testsuite/gcc.dg/pr46032-2.c
new file mode 100644
index 0000000..e110880
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr46032-2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fopenmp -std=c99 -fipa-pta -fdump-tree-optimized" } */
+
+#define N 2
+
+int
+foo (void)
+{
+  int a[N], b[N], c[N];
+  int *ap = &a[0];
+  int *bp = &b[0];
+  int *cp = &c[0];
+
+#pragma omp parallel for
+  for (unsigned int idx = 0; idx < N; idx++)
+    {
+      ap[idx] = 1;
+      bp[idx] = 2;
+      cp[idx] = ap[idx];
+    }
+
+  return *cp;
+}
+
+/* { dg-final { scan-tree-dump-times "\\] = 1;" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\\] = 2;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\\] = _\[0-9\]*;" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\\] = " 3 "optimized" } } */
+
diff --git a/gcc/testsuite/gcc.dg/pr46032-3.c b/gcc/testsuite/gcc.dg/pr46032-3.c
new file mode 100644
index 0000000..a4af7ec
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr46032-3.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fopenmp -std=c99 -fipa-pta -fdump-tree-optimized" } */
+
+#define N 2
+
+int
+foo (void)
+{
+  int a[N], c[N];
+  int *ap = &a[0];
+  int *bp = &a[0];
+  int *cp = &c[0];
+
+#pragma omp parallel for
+  for (unsigned int idx = 0; idx < N; idx++)
+    {
+      ap[idx] = 1;
+      bp[idx] = 2;
+      cp[idx] = ap[idx];
+    }
+
+  return *cp;
+}
+
+/* { dg-final { scan-tree-dump-times "\\] = 1;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\\] = 2;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\\] = _\[0-9\]*;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\\] = " 3 "optimized" } } */

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta
  2015-11-30 18:00           ` Tom de Vries
@ 2015-12-01 12:27             ` Christophe Lyon
  2015-12-01 12:30               ` Jakub Jelinek
  0 siblings, 1 reply; 44+ messages in thread
From: Christophe Lyon @ 2015-12-01 12:27 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Jakub Jelinek, Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1786 bytes --]

On 30 November 2015 at 18:55, Tom de Vries <Tom_deVries@mentor.com> wrote:
> On 30/11/15 17:48, Jakub Jelinek wrote:
>>
>> On Mon, Nov 30, 2015 at 05:36:25PM +0100, Tom de Vries wrote:
>>>
>>> +int
>>> +main (void)
>>> +{
>>> +  unsigned results[nEvents];
>>> +  unsigned pData[nEvents];
>>> +  unsigned coeff = 2;
>>> +
>>> +  init (&results[0], &pData[0]);
>>> +
>>> +#pragma omp parallel for
>>> +  for (int idx = 0; idx < (int)nEvents; idx++)
>>> +    results[idx] = coeff * pData[idx];
>>
>>
>> Could you please add another testcase, where you have say pData
>> and some other pointer that init sets to alias with pData, and verify
>> that such loop (would need to be say normal loop inside #pragma omp single
>> or master) is not vectorized?
>
>
> I've:
> - added a simpler (not vectorizer-based) version of the testcase as
>   pr46032-2.c, and
> - copied pr46032-2.c to pr46032-3.c and modified it such that two
>   pointers are aliasing
>
> Committed to trunk.
>

Hi,

I've committed the attached patch as obvious: it adds
dg-require-effective-target fopenmp to these new tests
so that they are skipped e.g. on arm bare-metal targets
(using newlib).

Note that pr46032.c has some failures:
FAIL:  gcc.dg/pr46032.c scan-tree-dump-times vect "note: vectorized 1 loop" 1
on arm-none-linux-gnueabi, on arm-none-linux-gnueabihf with -mfpu=vfp*,
and on armeb-none-linux-gnueabihf

I haven't looked at the details yet; see
http://people.linaro.org/~christophe.lyon/cross-validation/gcc/trunk/231076/report-build-info.html
for more information.

Thanks,

Christophe.

2015-12-01  Christophe Lyon  <christophe.lyon@linaro.org>

    * gcc.dg/pr46032.c: Add dg-require-effective-target fopenmp.
    * gcc.dg/pr46032-2.c: Likewise.
    * gcc.dg/pr46032-3.c: Likewise.


> Thanks,
> - Tom
>

[-- Attachment #2: pr46032.patch.txt --]
[-- Type: text/plain, Size: 1212 bytes --]

Index: gcc/testsuite/gcc.dg/pr46032-2.c
===================================================================
--- gcc/testsuite/gcc.dg/pr46032-2.c	(revision 231108)
+++ gcc/testsuite/gcc.dg/pr46032-2.c	(working copy)
@@ -1,4 +1,5 @@
 /* { dg-do compile } */
+/* { dg-require-effective-target fopenmp } */
 /* { dg-options "-O2 -fopenmp -std=c99 -fipa-pta -fdump-tree-optimized" } */
 
 #define N 2
Index: gcc/testsuite/gcc.dg/pr46032-3.c
===================================================================
--- gcc/testsuite/gcc.dg/pr46032-3.c	(revision 231108)
+++ gcc/testsuite/gcc.dg/pr46032-3.c	(working copy)
@@ -1,4 +1,5 @@
 /* { dg-do compile } */
+/* { dg-require-effective-target fopenmp } */
 /* { dg-options "-O2 -fopenmp -std=c99 -fipa-pta -fdump-tree-optimized" } */
 
 #define N 2
Index: gcc/testsuite/gcc.dg/pr46032.c
===================================================================
--- gcc/testsuite/gcc.dg/pr46032.c	(revision 231108)
+++ gcc/testsuite/gcc.dg/pr46032.c	(working copy)
@@ -1,4 +1,5 @@
 /* { dg-do compile } */
+/* { dg-require-effective-target fopenmp } */
 /* { dg-options "-O2 -fopenmp -ftree-vectorize -std=c99 -fipa-pta -fdump-tree-vect-all" } */
 
 extern void abort (void);

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta
  2015-12-01 12:27             ` Christophe Lyon
@ 2015-12-01 12:30               ` Jakub Jelinek
  2015-12-01 14:49                 ` Tom de Vries
  0 siblings, 1 reply; 44+ messages in thread
From: Jakub Jelinek @ 2015-12-01 12:30 UTC (permalink / raw)
  To: Christophe Lyon; +Cc: Tom de Vries, Richard Biener, gcc-patches

On Tue, Dec 01, 2015 at 01:27:32PM +0100, Christophe Lyon wrote:
> I've committed the attached patch as obvious: it adds
> dg-require-effective-target fopenmp to these new tests
> so that they are skipped e.g. on arm bare-metal targets
> (using newlib).
> 
> Note that pr46032.c has some failures:
> FAIL:  gcc.dg/pr46032.c scan-tree-dump-times vect "note: vectorized 1 loop" 1
> on arm-none-linux-gnueabi, on arm-none-linux-gnueabihf with -mfpu=vfp*,
> and on armeb-none-linux-gnueabihf
> 
> I haven't looked at the details yet; see
> http://people.linaro.org/~christophe.lyon/cross-validation/gcc/trunk/231076/report-build-info.html
> for more information.

Supposedly pr46032-{2,3}.c should go into testsuite/gcc.dg/gomp/ instead and
pr46032.c into testsuite/gcc.dg/vect/ (with the fopenmp effective target and
perhaps some other effective target conditions)?
> 2015-12-01  Christophe Lyon  <christophe.lyon@linaro.org>
> 
>     * gcc.dg/pr46032.c: Add dg-require-effective-target fopenmp.
>     * gcc.dg/pr46032-2.c: Likewise.
>     * gcc.dg/pr46032-3.c: Likewise.

	Jakub

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-11-30 16:44       ` Tom de Vries
  2015-11-30 16:52         ` Jakub Jelinek
@ 2015-12-01 14:27         ` Tom de Vries
  2015-12-01 14:38           ` Richard Biener
                             ` (2 more replies)
  1 sibling, 3 replies; 44+ messages in thread
From: Tom de Vries @ 2015-12-01 14:27 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jakub Jelinek, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 759 bytes --]

[ was: Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta ]

On 30/11/15 17:36, Tom de Vries wrote:
> On 30/11/15 14:24, Richard Biener wrote:
>> On Mon, 30 Nov 2015, Tom de Vries wrote:
>>
>>> On 30/11/15 10:16, Richard Biener wrote:
>>>> On Mon, 30 Nov 2015, Tom de Vries wrote:
>>>>
>>>>> Hi,
>>>>>
>>>>> this patch fixes PR46032.
>>>>>
>>>>> It handles a call:
>>>>> ...
>>>>>     __builtin_GOMP_parallel (fn, data, num_threads, flags)
>>>>> ...
>>>>> as:
>>>>> ...
>>>>>     fn (data)
>>>>> ...
>>>>> in ipa-pta.
>>>>>
>>>>> This improves ipa-pta alias analysis in the parallelized function
>>>>> fn,

This follow-up patch does the same for BUILT_IN_GOACC_PARALLEL.

Bootstrapped and reg-tested on x86_64.

OK for stage3 trunk?

Thanks,
- Tom

[-- Attachment #2: 0001-Handle-BUILT_IN_GOACC_PARALLEL-in-ipa-pta.patch --]
[-- Type: text/x-patch, Size: 8419 bytes --]

Handle BUILT_IN_GOACC_PARALLEL in ipa-pta

2015-12-01  Tom de Vries  <tom@codesourcery.com>

	* tree-ssa-structalias.c (find_func_aliases_for_builtin_call)
	(find_func_clobbers, ipa_pta_execute): Handle BUILT_IN_GOACC_PARALLEL.

	* c-c++-common/goacc/kernels-alias-ipa-pta-2.c: New test.
	* c-c++-common/goacc/kernels-alias-ipa-pta-3.c: New test.
	* c-c++-common/goacc/kernels-alias-ipa-pta.c: New test.

---
 .../c-c++-common/goacc/kernels-alias-ipa-pta-2.c   | 37 ++++++++++++++++++++++
 .../c-c++-common/goacc/kernels-alias-ipa-pta-3.c   | 36 +++++++++++++++++++++
 .../c-c++-common/goacc/kernels-alias-ipa-pta.c     | 23 ++++++++++++++
 gcc/tree-ssa-structalias.c                         | 28 +++++++++++++---
 .../kernels-alias-ipa-pta-2.c                      | 27 ++++++++++++++++
 .../kernels-alias-ipa-pta-3.c                      | 26 +++++++++++++++
 .../kernels-alias-ipa-pta.c                        | 26 +++++++++++++++
 7 files changed, 199 insertions(+), 4 deletions(-)

diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-2.c b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-2.c
new file mode 100644
index 0000000..f16d698
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-2.c
@@ -0,0 +1,37 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fipa-pta -fdump-tree-optimized" } */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+typedef __SIZE_TYPE__ size_t;
+void *malloc (size_t);
+void free (void *);
+#ifdef __cplusplus
+}
+#endif
+
+#define N 2
+
+void
+foo (void)
+{
+  unsigned int *a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  unsigned int *b = (unsigned int *)malloc (N * sizeof (unsigned int));
+  unsigned int *c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc kernels pcopyout (a[0:N], b[0:N], c[0:N])
+  {
+    a[0] = 0;
+    b[0] = 1;
+    c[0] = a[0];
+  }
+
+  free (a);
+  free (b);
+  free (c);
+}
+
+/* { dg-final { scan-tree-dump-times "(?n)= 0;$" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n)= 1;$" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n)= \\*a" 0 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-3.c b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-3.c
new file mode 100644
index 0000000..1eb56eb
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta-3.c
@@ -0,0 +1,36 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fipa-pta -fdump-tree-optimized" } */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+typedef __SIZE_TYPE__ size_t;
+void *malloc (size_t);
+void free (void *);
+#ifdef __cplusplus
+}
+#endif
+
+#define N 2
+
+void
+foo (void)
+{
+  unsigned int *a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  unsigned int *b = a;
+  unsigned int *c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc kernels pcopyout (a[0:N], b[0:N], c[0:N])
+  {
+    a[0] = 0;
+    b[0] = 1;
+    c[0] = a[0];
+  }
+
+  free (a);
+  free (c);
+}
+
+/* { dg-final { scan-tree-dump-times "(?n)= 0;$" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n)= 1;$" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n)= \\*a" 1 "optimized" } } */
diff --git a/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta.c b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta.c
new file mode 100644
index 0000000..969b466
--- /dev/null
+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta.c
@@ -0,0 +1,23 @@
+/* { dg-additional-options "-O2" } */
+/* { dg-additional-options "-fipa-pta -fdump-tree-optimized" } */
+
+#define N 2
+
+void
+foo (void)
+{
+  unsigned int a[N];
+  unsigned int b[N];
+  unsigned int c[N];
+
+#pragma acc kernels pcopyout (a, b, c)
+  {
+    a[0] = 0;
+    b[0] = 1;
+    c[0] = a[0];
+  }
+}
+
+/* { dg-final { scan-tree-dump-times "(?n)= 0;$" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n)= 1;$" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "(?n)= \\*_\[0-9\]\\\[0\\\];$" 0 "optimized" } } */
diff --git a/gcc/tree-ssa-structalias.c b/gcc/tree-ssa-structalias.c
index 7f4a8ad..060ff3e 100644
--- a/gcc/tree-ssa-structalias.c
+++ b/gcc/tree-ssa-structalias.c
@@ -4507,15 +4507,32 @@ find_func_aliases_for_builtin_call (struct function *fn, gcall *t)
 	  return true;
 	}
       case BUILT_IN_GOMP_PARALLEL:
+      case BUILT_IN_GOACC_PARALLEL:
 	{
-	  /* Handle __builtin_GOMP_parallel (fn, data, num_threads, flags) as
-	     fn (data).  */
 	  if (in_ipa_mode)
 	    {
-	      tree fnarg = gimple_call_arg (t, 0);
+	      unsigned int fnpos, argpos;
+	      switch (DECL_FUNCTION_CODE (fndecl))
+		{
+		case BUILT_IN_GOMP_PARALLEL:
+		  /* __builtin_GOMP_parallel (fn, data, num_threads, flags).  */
+		  fnpos = 0;
+		  argpos = 1;
+		  break;
+		case BUILT_IN_GOACC_PARALLEL:
+		  /* __builtin_GOACC_parallel (device, fn, mapnum, hostaddrs,
+					       sizes, kinds, ...).  */
+		  fnpos = 1;
+		  argpos = 3;
+		  break;
+		default:
+		  gcc_unreachable ();
+		}
+
+	      tree fnarg = gimple_call_arg (t, fnpos);
 	      gcc_assert (TREE_CODE (fnarg) == ADDR_EXPR);
 	      tree fndecl = TREE_OPERAND (fnarg, 0);
-	      tree arg = gimple_call_arg (t, 1);
+	      tree arg = gimple_call_arg (t, argpos);
 	      gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
 
 	      varinfo_t fi = get_vi_for_tree (fndecl);
@@ -5064,6 +5081,7 @@ find_func_clobbers (struct function *fn, gimple *origt)
 	  case BUILT_IN_VA_END:
 	    return;
 	  case BUILT_IN_GOMP_PARALLEL:
+	  case BUILT_IN_GOACC_PARALLEL:
 	    return;
 	  /* printf-style functions may have hooks to set pointers to
 	     point to somewhere into the generated string.  Leave them
@@ -7547,6 +7565,8 @@ ipa_pta_execute (void)
 	      /* Handle direct calls to functions with body.  */
 	      if (gimple_call_builtin_p (stmt, BUILT_IN_GOMP_PARALLEL))
 		decl = TREE_OPERAND (gimple_call_arg (stmt, 0), 0);
+	      else if (gimple_call_builtin_p (stmt, BUILT_IN_GOACC_PARALLEL))
+		decl = TREE_OPERAND (gimple_call_arg (stmt, 1), 0);
 	      else
 		decl = gimple_call_fndecl (stmt);
 
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-2.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-2.c
new file mode 100644
index 0000000..0f323c8
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-2.c
@@ -0,0 +1,27 @@
+/* { dg-additional-options "-O2 -fipa-pta" } */
+
+#include <stdlib.h>
+
+#define N 2
+
+int
+main (void)
+{
+  unsigned int *a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  unsigned int *b = (unsigned int *)malloc (N * sizeof (unsigned int));
+  unsigned int *c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc kernels pcopyout (a[0:N], b[0:N], c[0:N])
+  {
+    a[0] = 0;
+    b[0] = 1;
+    c[0] = a[0];
+  }
+
+  if (a[0] != 0 || b[0] != 1 || c[0] != 0)
+    abort ();
+
+  free (a);
+  free (b);
+  free (c);
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-3.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-3.c
new file mode 100644
index 0000000..654e750
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta-3.c
@@ -0,0 +1,26 @@
+/* { dg-additional-options "-O2 -fipa-pta" } */
+
+#include <stdlib.h>
+
+#define N 2
+
+int
+main (void)
+{
+  unsigned int *a = (unsigned int *)malloc (N * sizeof (unsigned int));
+  unsigned int *b = a;
+  unsigned int *c = (unsigned int *)malloc (N * sizeof (unsigned int));
+
+#pragma acc kernels pcopyout (a[0:N], b[0:N], c[0:N])
+  {
+    a[0] = 0;
+    b[0] = 1;
+    c[0] = a[0];
+  }
+
+  if (a[0] != 1 || b[0] != 1 || c[0] != 1)
+    abort ();
+
+  free (a);
+  free (c);
+}
diff --git a/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta.c b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta.c
new file mode 100644
index 0000000..44d4fd2
--- /dev/null
+++ b/libgomp/testsuite/libgomp.oacc-c-c++-common/kernels-alias-ipa-pta.c
@@ -0,0 +1,26 @@
+/* { dg-additional-options "-O2 -fipa-pta" } */
+
+#include <stdlib.h>
+
+#define N 2
+
+int
+main (void)
+{
+  unsigned int a[N];
+  unsigned int b[N];
+  unsigned int c[N];
+
+#pragma acc kernels pcopyout (a, b, c)
+  {
+    a[0] = 0;
+    b[0] = 1;
+    c[0] = a[0];
+  }
+
+  if (a[0] != 0 || b[0] != 1 || c[0] != 0)
+    abort ();
+
+  return 0;
+}
+

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-12-01 14:27         ` [PATCH] Handle BUILT_IN_GOACC_PARALLEL " Tom de Vries
@ 2015-12-01 14:38           ` Richard Biener
  2015-12-01 14:44           ` Jakub Jelinek
  2015-12-02 17:58           ` Thomas Schwinge
  2 siblings, 0 replies; 44+ messages in thread
From: Richard Biener @ 2015-12-01 14:38 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Jakub Jelinek, gcc-patches

On Tue, 1 Dec 2015, Tom de Vries wrote:

> [ was: Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta ]
> 
> On 30/11/15 17:36, Tom de Vries wrote:
> > On 30/11/15 14:24, Richard Biener wrote:
> > > On Mon, 30 Nov 2015, Tom de Vries wrote:
> > > 
> > > > On 30/11/15 10:16, Richard Biener wrote:
> > > > > On Mon, 30 Nov 2015, Tom de Vries wrote:
> > > > > 
> > > > > > Hi,
> > > > > > 
> > > > > > this patch fixes PR46032.
> > > > > > 
> > > > > > It handles a call:
> > > > > > ...
> > > > > >     __builtin_GOMP_parallel (fn, data, num_threads, flags)
> > > > > > ...
> > > > > > as:
> > > > > > ...
> > > > > >     fn (data)
> > > > > > ...
> > > > > > in ipa-pta.
> > > > > > 
> > > > > > This improves ipa-pta alias analysis in the parallelized function
> > > > > > fn,
> 
> This follow-up patch does the same for BUILT_IN_GOACC_PARALLEL.
> 
> Bootstrapped and reg-tested on x86_64.
> 
> OK for stage3 trunk?

Ok.

Richard.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-12-01 14:27         ` [PATCH] Handle BUILT_IN_GOACC_PARALLEL " Tom de Vries
  2015-12-01 14:38           ` Richard Biener
@ 2015-12-01 14:44           ` Jakub Jelinek
  2015-12-01 23:48             ` Tom de Vries
  2015-12-02 17:58           ` Thomas Schwinge
  2 siblings, 1 reply; 44+ messages in thread
From: Jakub Jelinek @ 2015-12-01 14:44 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Richard Biener, gcc-patches

On Tue, Dec 01, 2015 at 03:25:42PM +0100, Tom de Vries wrote:
> Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
> 
> 2015-12-01  Tom de Vries  <tom@codesourcery.com>
> 
> 	* tree-ssa-structalias.c (find_func_aliases_for_builtin_call)
> 	(find_func_clobbers, ipa_pta_execute): Handle BUILT_IN_GOACC_PARALLEL.

Isn't this cheating though?  The kernel will be called with those addresses
only if doing host fallback (and for GOMP_target_ext even not for that
always - firstprivate vars will have the addresses replaced by addresses of
alloca-ed copies of those objects).
I haven't studied in detail what exactly IPA-PTA does, so maybe it is good
enough to pretend that.

	Jakub

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta
  2015-12-01 12:30               ` Jakub Jelinek
@ 2015-12-01 14:49                 ` Tom de Vries
  0 siblings, 0 replies; 44+ messages in thread
From: Tom de Vries @ 2015-12-01 14:49 UTC (permalink / raw)
  To: Jakub Jelinek, Christophe Lyon; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1069 bytes --]

On 01/12/15 13:29, Jakub Jelinek wrote:
> On Tue, Dec 01, 2015 at 01:27:32PM +0100, Christophe Lyon wrote:
>> >I've committed the attached patch as obvious: it adds
>> >dg-require-effective-target fopenmp to these new tests
>> >so that they are skipped e.g. on arm bare-metal targets
>> >(using newlib).
>> >
>> >Note that pr46032.c has some failures:
>> >FAIL:  gcc.dg/pr46032.c scan-tree-dump-times vect "note: vectorized 1 loop" 1
>> >on arm-none-linux-gnueabi, on arm-none-linux-gnueabihf with -mfpu=vfp*,
>> >and on armeb-none-linux-gnueabihf
>> >
>> >I haven't looked at the details yet; see
>> >http://people.linaro.org/~christophe.lyon/cross-validation/gcc/trunk/231076/report-build-info.html
>> >for more information.

> Supposedly pr46032-{2,3}.c should go into testsuite/gcc.dg/gomp/ instead and
> pr46032.c into testsuite/gcc.dg/vect/ (with the fopenmp effective target and
> perhaps some other effective target conditions)?

I've moved the tests, and added dg-require-effective-target vect_int in 
pr46032.c.

Committed to trunk as obvious.

Thanks,
- Tom

[-- Attachment #2: 0001-Move-pr46032-.c-tests.patch --]
[-- Type: text/x-patch, Size: 7273 bytes --]

Move pr46032*.c tests

2015-12-01  Tom de Vries  <tom@codesourcery.com>

	* gcc.dg/pr46032.c: Move to ...
	* gcc.dg/vect/pr46032.c: here.  Add dg-require-effective-target
	vect_int.
	* gcc.dg/pr46032-2.c: Move to ...
	* gcc.dg/gomp/pr46032-2.c: ... here.  Drop dg-require-effective-target fopenmp.
	* gcc.dg/pr46032-3.c: Move to ...
	* gcc.dg/gomp/pr46032-3.c: ... here.  Drop dg-require-effective-target fopenmp.

---
 gcc/testsuite/gcc.dg/gomp/pr46032-2.c | 29 +++++++++++++++++++++
 gcc/testsuite/gcc.dg/gomp/pr46032-3.c | 28 ++++++++++++++++++++
 gcc/testsuite/gcc.dg/pr46032-2.c      | 30 ---------------------
 gcc/testsuite/gcc.dg/pr46032-3.c      | 29 ---------------------
 gcc/testsuite/gcc.dg/pr46032.c        | 48 ----------------------------------
 gcc/testsuite/gcc.dg/vect/pr46032.c   | 49 +++++++++++++++++++++++++++++++++++
 6 files changed, 106 insertions(+), 107 deletions(-)

diff --git a/gcc/testsuite/gcc.dg/gomp/pr46032-2.c b/gcc/testsuite/gcc.dg/gomp/pr46032-2.c
new file mode 100644
index 0000000..e110880
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/gomp/pr46032-2.c
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fopenmp -std=c99 -fipa-pta -fdump-tree-optimized" } */
+
+#define N 2
+
+int
+foo (void)
+{
+  int a[N], b[N], c[N];
+  int *ap = &a[0];
+  int *bp = &b[0];
+  int *cp = &c[0];
+
+#pragma omp parallel for
+  for (unsigned int idx = 0; idx < N; idx++)
+    {
+      ap[idx] = 1;
+      bp[idx] = 2;
+      cp[idx] = ap[idx];
+    }
+
+  return *cp;
+}
+
+/* { dg-final { scan-tree-dump-times "\\] = 1;" 2 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\\] = 2;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\\] = _\[0-9\]*;" 0 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\\] = " 3 "optimized" } } */
+
diff --git a/gcc/testsuite/gcc.dg/gomp/pr46032-3.c b/gcc/testsuite/gcc.dg/gomp/pr46032-3.c
new file mode 100644
index 0000000..a4af7ec
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/gomp/pr46032-3.c
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fopenmp -std=c99 -fipa-pta -fdump-tree-optimized" } */
+
+#define N 2
+
+int
+foo (void)
+{
+  int a[N], c[N];
+  int *ap = &a[0];
+  int *bp = &a[0];
+  int *cp = &c[0];
+
+#pragma omp parallel for
+  for (unsigned int idx = 0; idx < N; idx++)
+    {
+      ap[idx] = 1;
+      bp[idx] = 2;
+      cp[idx] = ap[idx];
+    }
+
+  return *cp;
+}
+
+/* { dg-final { scan-tree-dump-times "\\] = 1;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\\] = 2;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\\] = _\[0-9\]*;" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "\\] = " 3 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/pr46032-2.c b/gcc/testsuite/gcc.dg/pr46032-2.c
deleted file mode 100644
index d769597..0000000
--- a/gcc/testsuite/gcc.dg/pr46032-2.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/* { dg-do compile } */
-/* { dg-require-effective-target fopenmp } */
-/* { dg-options "-O2 -fopenmp -std=c99 -fipa-pta -fdump-tree-optimized" } */
-
-#define N 2
-
-int
-foo (void)
-{
-  int a[N], b[N], c[N];
-  int *ap = &a[0];
-  int *bp = &b[0];
-  int *cp = &c[0];
-
-#pragma omp parallel for
-  for (unsigned int idx = 0; idx < N; idx++)
-    {
-      ap[idx] = 1;
-      bp[idx] = 2;
-      cp[idx] = ap[idx];
-    }
-
-  return *cp;
-}
-
-/* { dg-final { scan-tree-dump-times "\\] = 1;" 2 "optimized" } } */
-/* { dg-final { scan-tree-dump-times "\\] = 2;" 1 "optimized" } } */
-/* { dg-final { scan-tree-dump-times "\\] = _\[0-9\]*;" 0 "optimized" } } */
-/* { dg-final { scan-tree-dump-times "\\] = " 3 "optimized" } } */
-
diff --git a/gcc/testsuite/gcc.dg/pr46032-3.c b/gcc/testsuite/gcc.dg/pr46032-3.c
deleted file mode 100644
index a9e74d0..0000000
--- a/gcc/testsuite/gcc.dg/pr46032-3.c
+++ /dev/null
@@ -1,29 +0,0 @@
-/* { dg-do compile } */
-/* { dg-require-effective-target fopenmp } */
-/* { dg-options "-O2 -fopenmp -std=c99 -fipa-pta -fdump-tree-optimized" } */
-
-#define N 2
-
-int
-foo (void)
-{
-  int a[N], c[N];
-  int *ap = &a[0];
-  int *bp = &a[0];
-  int *cp = &c[0];
-
-#pragma omp parallel for
-  for (unsigned int idx = 0; idx < N; idx++)
-    {
-      ap[idx] = 1;
-      bp[idx] = 2;
-      cp[idx] = ap[idx];
-    }
-
-  return *cp;
-}
-
-/* { dg-final { scan-tree-dump-times "\\] = 1;" 1 "optimized" } } */
-/* { dg-final { scan-tree-dump-times "\\] = 2;" 1 "optimized" } } */
-/* { dg-final { scan-tree-dump-times "\\] = _\[0-9\]*;" 1 "optimized" } } */
-/* { dg-final { scan-tree-dump-times "\\] = " 3 "optimized" } } */
diff --git a/gcc/testsuite/gcc.dg/pr46032.c b/gcc/testsuite/gcc.dg/pr46032.c
deleted file mode 100644
index aae0019..0000000
--- a/gcc/testsuite/gcc.dg/pr46032.c
+++ /dev/null
@@ -1,48 +0,0 @@
-/* { dg-do compile } */
-/* { dg-require-effective-target fopenmp } */
-/* { dg-options "-O2 -fopenmp -ftree-vectorize -std=c99 -fipa-pta -fdump-tree-vect-all" } */
-
-extern void abort (void);
-
-#define nEvents 1000
-
-static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
-init (unsigned *results, unsigned *pData)
-{
-  unsigned int i;
-  for (i = 0; i < nEvents; ++i)
-    pData[i] = i % 3;
-}
-
-static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
-check (unsigned *results)
-{
-  unsigned sum = 0;
-  for (int idx = 0; idx < (int)nEvents; idx++)
-    sum += results[idx];
-
-  if (sum != 1998)
-    abort ();
-}
-
-int
-main (void)
-{
-  unsigned results[nEvents];
-  unsigned pData[nEvents];
-  unsigned coeff = 2;
-
-  init (&results[0], &pData[0]);
-
-#pragma omp parallel for
-  for (int idx = 0; idx < (int)nEvents; idx++)
-    results[idx] = coeff * pData[idx];
-
-  check (&results[0]);
-
-  return 0;
-}
-
-/* { dg-final { scan-tree-dump-times "note: vectorized 1 loop" 1 "vect" } } */
-/* { dg-final { scan-tree-dump-not "versioning for alias required" "vect" } } */
-
diff --git a/gcc/testsuite/gcc.dg/vect/pr46032.c b/gcc/testsuite/gcc.dg/vect/pr46032.c
new file mode 100644
index 0000000..8aa725a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr46032.c
@@ -0,0 +1,49 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target fopenmp } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-options "-O2 -fopenmp -ftree-vectorize -std=c99 -fipa-pta -fdump-tree-vect-all" } */
+
+extern void abort (void);
+
+#define nEvents 1000
+
+static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
+init (unsigned *results, unsigned *pData)
+{
+  unsigned int i;
+  for (i = 0; i < nEvents; ++i)
+    pData[i] = i % 3;
+}
+
+static void __attribute__((noinline, noclone, optimize("-fno-tree-vectorize")))
+check (unsigned *results)
+{
+  unsigned sum = 0;
+  for (int idx = 0; idx < (int)nEvents; idx++)
+    sum += results[idx];
+
+  if (sum != 1998)
+    abort ();
+}
+
+int
+main (void)
+{
+  unsigned results[nEvents];
+  unsigned pData[nEvents];
+  unsigned coeff = 2;
+
+  init (&results[0], &pData[0]);
+
+#pragma omp parallel for
+  for (int idx = 0; idx < (int)nEvents; idx++)
+    results[idx] = coeff * pData[idx];
+
+  check (&results[0]);
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "note: vectorized 1 loop" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-not "versioning for alias required" "vect" } } */
+

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-12-01 14:44           ` Jakub Jelinek
@ 2015-12-01 23:48             ` Tom de Vries
  2015-12-02  9:31               ` Jakub Jelinek
  0 siblings, 1 reply; 44+ messages in thread
From: Tom de Vries @ 2015-12-01 23:48 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Biener, gcc-patches

On 01/12/15 15:44, Jakub Jelinek wrote:
> On Tue, Dec 01, 2015 at 03:25:42PM +0100, Tom de Vries wrote:
>> Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
>>
>> 2015-12-01  Tom de Vries  <tom@codesourcery.com>
>>
>> 	* tree-ssa-structalias.c (find_func_aliases_for_builtin_call)
>> 	(find_func_clobbers, ipa_pta_execute): Handle BUILT_IN_GOACC_PARALLEL.
>
> Isn't this cheating though?  The kernel will be called with those addresses
> only if doing host fallback

Let's take a look at goacc/kernels-alias-ipa-pta.c:
...
unsigned int a[N];
unsigned int b[N];
unsigned int c[N];

#pragma acc kernels pcopyout (a, b, c)
{
   a[0] = 0;
   b[0] = 1;
   c[0] = a[0];
}
...

If we execute on the host, the a, b and c used in the kernels region 
will be the a, b and c declared outside the region.

If we execute on a non-shared mem accelerator, the a, b and c used in 
the kernels region will be copies of a, b and c in the accelerator 
memory: a.1, b.1 and c.1.

This patch tells ipa-pta (which has no notion of a.1, b.1 and c.1) that 
we're using declared a, b, and c, in the kernels region, while on the 
accelerator we're really using a.1, b.1 and c.1, so in that sense it's 
cheating.

However, given that declared a, b and c are disjunct, we know that their 
copies will be disjunct, so by pretending that declared a, b and c are 
used in the kernels region, we get conclusions which are also valid when 
we use a.1, b.1 and c.1 instead in the kernels region.

So, for this patch to be incorrect we have to find an example where 
ipa-pta finds that two memory references are not aliasing, while on the 
accelerator those memory references are really aliasing. AFAICT there 
are no such examples.

> (and for GOMP_target_ext even not for that
> always - firstprivate vars will have the addresses replaced by addresses of
> alloca-ed copies of those objects).

I don't think firstprivate vars is a problem, I think the opposite would 
a problem: merging vars on the accelerator which are disjunct on the host.

> I haven't studied in detail what exactly IPA-PTA does, so maybe it is good
> enough to pretend that.

AFAIU, it's good enough, because the points-to information is only used 
to prove non-aliases.

Does this explanation address your concern?

Thanks,
- Tom

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-12-01 23:48             ` Tom de Vries
@ 2015-12-02  9:31               ` Jakub Jelinek
  0 siblings, 0 replies; 44+ messages in thread
From: Jakub Jelinek @ 2015-12-02  9:31 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Richard Biener, gcc-patches

On Wed, Dec 02, 2015 at 12:46:47AM +0100, Tom de Vries wrote:
> Does this explanation address your concern?

Yeah, for now it is fine I hope.

	Jakub

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-12-01 14:27         ` [PATCH] Handle BUILT_IN_GOACC_PARALLEL " Tom de Vries
  2015-12-01 14:38           ` Richard Biener
  2015-12-01 14:44           ` Jakub Jelinek
@ 2015-12-02 17:58           ` Thomas Schwinge
  2015-12-02 23:32             ` Tom de Vries
  2 siblings, 1 reply; 44+ messages in thread
From: Thomas Schwinge @ 2015-12-02 17:58 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Jakub Jelinek, gcc-patches, Richard Biener

[-- Attachment #1: Type: text/plain, Size: 3727 bytes --]

Hi!

On Tue, 1 Dec 2015 15:25:42 +0100, Tom de Vries <Tom_deVries@mentor.com> wrote:
> Handle BUILT_IN_GOACC_PARALLEL in ipa-pta

> 	* c-c++-common/goacc/kernels-alias-ipa-pta-2.c: New test.
> 	* c-c++-common/goacc/kernels-alias-ipa-pta-3.c: New test.
> 	* c-c++-common/goacc/kernels-alias-ipa-pta.c: New test.

I see:

    PASS: c-c++-common/goacc/kernels-alias-ipa-pta-2.c (test for excess errors)
    FAIL: c-c++-common/goacc/kernels-alias-ipa-pta-2.c scan-tree-dump-times optimized "(?n)= 0;$" 2
    PASS: c-c++-common/goacc/kernels-alias-ipa-pta-2.c scan-tree-dump-times optimized "(?n)= 1;$" 1
    FAIL: c-c++-common/goacc/kernels-alias-ipa-pta-2.c scan-tree-dump-times optimized "(?n)= \\*a" 0
    PASS: c-c++-common/goacc/kernels-alias-ipa-pta-3.c (test for excess errors)
    PASS: c-c++-common/goacc/kernels-alias-ipa-pta-3.c scan-tree-dump-times optimized "(?n)= 0;$" 1
    PASS: c-c++-common/goacc/kernels-alias-ipa-pta-3.c scan-tree-dump-times optimized "(?n)= 1;$" 1
    PASS: c-c++-common/goacc/kernels-alias-ipa-pta-3.c scan-tree-dump-times optimized "(?n)= \\*a" 1
    PASS: c-c++-common/goacc/kernels-alias-ipa-pta.c (test for excess errors)
    FAIL: c-c++-common/goacc/kernels-alias-ipa-pta.c scan-tree-dump-times optimized "(?n)= 0;$" 2
    PASS: c-c++-common/goacc/kernels-alias-ipa-pta.c scan-tree-dump-times optimized "(?n)= 1;$" 1
    FAIL: c-c++-common/goacc/kernels-alias-ipa-pta.c scan-tree-dump-times optimized "(?n)= \\*_[0-9]\\[0\\];$" 0

..., and similar for C++.  Looking at
c-c++-common/goacc/kernels-alias-ipa-pta.c:

> --- /dev/null
> +++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta.c
> @@ -0,0 +1,23 @@
> +/* { dg-additional-options "-O2" } */
> +/* { dg-additional-options "-fipa-pta -fdump-tree-optimized" } */
> +
> +#define N 2
> +
> +void
> +foo (void)
> +{
> +  unsigned int a[N];
> +  unsigned int b[N];
> +  unsigned int c[N];
> +
> +#pragma acc kernels pcopyout (a, b, c)
> +  {
> +    a[0] = 0;
> +    b[0] = 1;
> +    c[0] = a[0];
> +  }
> +}
> +
> +/* { dg-final { scan-tree-dump-times "(?n)= 0;$" 2 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "(?n)= 1;$" 1 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times "(?n)= \\*_\[0-9\]\\\[0\\\];$" 0 "optimized" } } */

..., manually running that one for C, I get:

    ;; Function foo._omp_fn.0 (foo._omp_fn.0, funcdef_no=1, decl_uid=1874, cgraph_uid=1, symbol_order=1)
    
    __attribute__((oacc function (1, 1, 1), omp target entrypoint))
    foo._omp_fn.0 (const struct .omp_data_t.0 & restrict .omp_data_i)
    {
      unsigned int[2] * _3;
      unsigned int[2] * _5;
      unsigned int _7;
      unsigned int[2] * _8;
    
      <bb 2>:
      _3 = *.omp_data_i_2(D).a;
      *_3[0] = 0;
      _5 = *.omp_data_i_2(D).b;
      *_5[0] = 1;
      _7 = *_3[0];
      _8 = *.omp_data_i_2(D).c;
      *_8[0] = _7;
      return;
    
    }
    
    
    
    ;; Function foo (foo, funcdef_no=0, decl_uid=1866, cgraph_uid=0, symbol_order=0)
    
    foo ()
    {
      unsigned int c[2];
      unsigned int b[2];
      unsigned int a[2];
      struct .omp_data_t.0 .omp_data_arr.1;
      static long unsigned int .omp_data_sizes.2[3] = {8, 8, 8};
      static short unsigned int .omp_data_kinds.3[3] = {514, 514, 514};
    
      <bb 2>:
      .omp_data_arr.1.c = &c;
      .omp_data_arr.1.b = &b;
      .omp_data_arr.1.a = &a;
      GOACC_parallel_keyed (-1, foo._omp_fn.0, 3, &.omp_data_arr.1, &.omp_data_sizes.2, &.omp_data_kinds.3, 0);
      .omp_data_arr.1 ={v} {CLOBBER};
      a ={v} {CLOBBER};
      b ={v} {CLOBBER};
      c ={v} {CLOBBER};
      return;
    
    }


Grüße
 Thomas

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 472 bytes --]

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-12-02 17:58           ` Thomas Schwinge
@ 2015-12-02 23:32             ` Tom de Vries
  2015-12-03  0:10               ` Tom de Vries
  0 siblings, 1 reply; 44+ messages in thread
From: Tom de Vries @ 2015-12-02 23:32 UTC (permalink / raw)
  To: Thomas Schwinge; +Cc: Jakub Jelinek, gcc-patches, Richard Biener

[-- Attachment #1: Type: text/plain, Size: 3460 bytes --]

On 02/12/15 18:58, Thomas Schwinge wrote:
> Hi!
>
> On Tue, 1 Dec 2015 15:25:42 +0100, Tom de Vries<Tom_deVries@mentor.com>  wrote:
>> >Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
>> >	* c-c++-common/goacc/kernels-alias-ipa-pta-2.c: New test.
>> >	* c-c++-common/goacc/kernels-alias-ipa-pta-3.c: New test.
>> >	* c-c++-common/goacc/kernels-alias-ipa-pta.c: New test.
> I see:
>
>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta-2.c (test for excess errors)
>      FAIL: c-c++-common/goacc/kernels-alias-ipa-pta-2.c scan-tree-dump-times optimized "(?n)= 0;$" 2
>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta-2.c scan-tree-dump-times optimized "(?n)= 1;$" 1
>      FAIL: c-c++-common/goacc/kernels-alias-ipa-pta-2.c scan-tree-dump-times optimized "(?n)= \\*a" 0
>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta-3.c (test for excess errors)
>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta-3.c scan-tree-dump-times optimized "(?n)= 0;$" 1
>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta-3.c scan-tree-dump-times optimized "(?n)= 1;$" 1
>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta-3.c scan-tree-dump-times optimized "(?n)= \\*a" 1
>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta.c (test for excess errors)
>      FAIL: c-c++-common/goacc/kernels-alias-ipa-pta.c scan-tree-dump-times optimized "(?n)= 0;$" 2
>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta.c scan-tree-dump-times optimized "(?n)= 1;$" 1
>      FAIL: c-c++-common/goacc/kernels-alias-ipa-pta.c scan-tree-dump-times optimized "(?n)= \\*_[0-9]\\[0\\];$" 0
>
> ..., and similar for C++.

Curious, I get all passes for both C and C++ (at r231192).

>  Looking at
> c-c++-common/goacc/kernels-alias-ipa-pta.c:
>
>> >--- /dev/null
>> >+++ b/gcc/testsuite/c-c++-common/goacc/kernels-alias-ipa-pta.c
>> >@@ -0,0 +1,23 @@
>> >+/* { dg-additional-options "-O2" } */
>> >+/* { dg-additional-options "-fipa-pta -fdump-tree-optimized" } */
>> >+
>> >+#define N 2
>> >+
>> >+void
>> >+foo (void)
>> >+{
>> >+  unsigned int a[N];
>> >+  unsigned int b[N];
>> >+  unsigned int c[N];
>> >+
>> >+#pragma acc kernels pcopyout (a, b, c)
>> >+  {
>> >+    a[0] = 0;
>> >+    b[0] = 1;
>> >+    c[0] = a[0];
>> >+  }
>> >+}
>> >+
>> >+/* { dg-final { scan-tree-dump-times "(?n)= 0;$" 2 "optimized" } } */
>> >+/* { dg-final { scan-tree-dump-times "(?n)= 1;$" 1 "optimized" } } */
>> >+/* { dg-final { scan-tree-dump-times "(?n)= \\*_\[0-9\]\\\[0\\\];$" 0 "optimized" } } */
> ..., manually running that one for C, I get:
>
>      ;; Function foo._omp_fn.0 (foo._omp_fn.0, funcdef_no=1, decl_uid=1874, cgraph_uid=1, symbol_order=1)
>
>      __attribute__((oacc function (1, 1, 1), omp target entrypoint))
>      foo._omp_fn.0 (const struct .omp_data_t.0 & restrict .omp_data_i)
>      {
>        unsigned int[2] * _3;
>        unsigned int[2] * _5;
>        unsigned int _7;
>        unsigned int[2] * _8;
>
>        <bb 2>:
>        _3 = *.omp_data_i_2(D).a;
>        *_3[0] = 0;
>        _5 = *.omp_data_i_2(D).b;
>        *_5[0] = 1;
>        _7 = *_3[0];
>        _8 = *.omp_data_i_2(D).c;
>        *_8[0] = _7;
>        return;
>
>      }

Indeed, the optimization hasn't taken place here so it's correct that 
the scan fails.

I've attached the kernels-alias-ipa-pta.c.201t.optimized which I get, 
and it's clear the optimization is happening there (which explains why I 
get all PASSES).

The question is, why is the optimization not happening for you.

Thanks,
- Tom


[-- Attachment #2: kernels-alias-ipa-pta.c.201t.optimized --]
[-- Type: text/plain, Size: 1099 bytes --]


;; Function foo._omp_fn.0 (foo._omp_fn.0, funcdef_no=1, decl_uid=1874, cgraph_uid=1, symbol_order=1)

__attribute__((oacc function (1, 1, 1), omp target entrypoint))
foo._omp_fn.0 (const struct .omp_data_t.0 & restrict .omp_data_i)
{
  unsigned int[2] * _3;
  unsigned int[2] * _5;
  unsigned int[2] * _8;

  <bb 2>:
  _3 = *.omp_data_i_2(D).a;
  *_3[0] = 0;
  _5 = *.omp_data_i_2(D).b;
  *_5[0] = 1;
  _8 = *.omp_data_i_2(D).c;
  *_8[0] = 0;
  return;

}



;; Function foo (foo, funcdef_no=0, decl_uid=1866, cgraph_uid=0, symbol_order=0)

foo ()
{
  unsigned int c[2];
  unsigned int b[2];
  unsigned int a[2];
  struct .omp_data_t.0 .omp_data_arr.1;
  static long unsigned int .omp_data_sizes.2[3] = {8, 8, 8};
  static short unsigned int .omp_data_kinds.3[3] = {514, 514, 514};

  <bb 2>:
  .omp_data_arr.1.c = &c;
  .omp_data_arr.1.b = &b;
  .omp_data_arr.1.a = &a;
  __builtin_GOACC_parallel_keyed (-1, foo._omp_fn.0, 3, &.omp_data_arr.1, &.omp_data_sizes.2, &.omp_data_kinds.3, 0);
  .omp_data_arr.1 ={v} {CLOBBER};
  a ={v} {CLOBBER};
  b ={v} {CLOBBER};
  c ={v} {CLOBBER};
  return;

}



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-12-02 23:32             ` Tom de Vries
@ 2015-12-03  0:10               ` Tom de Vries
  2015-12-03  0:27                 ` Tom de Vries
  0 siblings, 1 reply; 44+ messages in thread
From: Tom de Vries @ 2015-12-03  0:10 UTC (permalink / raw)
  To: Thomas Schwinge; +Cc: Jakub Jelinek, gcc-patches, Richard Biener

On 03/12/15 00:31, Tom de Vries wrote:
> On 02/12/15 18:58, Thomas Schwinge wrote:
>> Hi!
>>
>> On Tue, 1 Dec 2015 15:25:42 +0100, Tom de
>> Vries<Tom_deVries@mentor.com>  wrote:
>>> >Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
>>> >    * c-c++-common/goacc/kernels-alias-ipa-pta-2.c: New test.
>>> >    * c-c++-common/goacc/kernels-alias-ipa-pta-3.c: New test.
>>> >    * c-c++-common/goacc/kernels-alias-ipa-pta.c: New test.
>> I see:
>>
>>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta-2.c (test for
>> excess errors)
>>      FAIL: c-c++-common/goacc/kernels-alias-ipa-pta-2.c
>> scan-tree-dump-times optimized "(?n)= 0;$" 2
>>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta-2.c
>> scan-tree-dump-times optimized "(?n)= 1;$" 1
>>      FAIL: c-c++-common/goacc/kernels-alias-ipa-pta-2.c
>> scan-tree-dump-times optimized "(?n)= \\*a" 0
>>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta-3.c (test for
>> excess errors)
>>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta-3.c
>> scan-tree-dump-times optimized "(?n)= 0;$" 1
>>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta-3.c
>> scan-tree-dump-times optimized "(?n)= 1;$" 1
>>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta-3.c
>> scan-tree-dump-times optimized "(?n)= \\*a" 1
>>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta.c (test for excess
>> errors)
>>      FAIL: c-c++-common/goacc/kernels-alias-ipa-pta.c
>> scan-tree-dump-times optimized "(?n)= 0;$" 2
>>      PASS: c-c++-common/goacc/kernels-alias-ipa-pta.c
>> scan-tree-dump-times optimized "(?n)= 1;$" 1
>>      FAIL: c-c++-common/goacc/kernels-alias-ipa-pta.c
>> scan-tree-dump-times optimized "(?n)= \\*_[0-9]\\[0\\];$" 0
>>
>> ..., and similar for C++.
>
> Curious, I get all passes for both C and C++ (at r231192).
>

I've managed to reproduce it. The difference between pass and fail is 
whether the compiler is configured with or without accelerator.

I'll look into it.

Thanks,
- Tom


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-12-03  0:10               ` Tom de Vries
@ 2015-12-03  0:27                 ` Tom de Vries
  2015-12-03  8:59                   ` Richard Biener
  0 siblings, 1 reply; 44+ messages in thread
From: Tom de Vries @ 2015-12-03  0:27 UTC (permalink / raw)
  To: Thomas Schwinge; +Cc: Jakub Jelinek, gcc-patches, Richard Biener

On 03/12/15 01:10, Tom de Vries wrote:
>
> I've managed to reproduce it. The difference between pass and fail is
> whether the compiler is configured with or without accelerator.
>
> I'll look into it.

In the configuration with accelerator, the flag node->force_output is on 
for foo._omp.fn.

This causes nonlocal_p to be true in ipa_pta_execute, which causes the 
optimization to fail.

The flag is decribed as:
...
   /* The symbol will be assumed to be used in an invisible way (like
      by an toplevel asm statement).  */
  ...

Looks like I have to ignore the force_output flag as well in 
ipa_pta_execute for this sort of node.

Thanks,
- Tom

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-12-03  0:27                 ` Tom de Vries
@ 2015-12-03  8:59                   ` Richard Biener
  2015-12-03 11:09                     ` Tom de Vries
  0 siblings, 1 reply; 44+ messages in thread
From: Richard Biener @ 2015-12-03  8:59 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Thomas Schwinge, Jakub Jelinek, gcc-patches

On Thu, 3 Dec 2015, Tom de Vries wrote:

> On 03/12/15 01:10, Tom de Vries wrote:
> > 
> > I've managed to reproduce it. The difference between pass and fail is
> > whether the compiler is configured with or without accelerator.
> > 
> > I'll look into it.
> 
> In the configuration with accelerator, the flag node->force_output is on for
> foo._omp.fn.
> 
> This causes nonlocal_p to be true in ipa_pta_execute, which causes the
> optimization to fail.
> 
> The flag is decribed as:
> ...
>   /* The symbol will be assumed to be used in an invisible way (like
>      by an toplevel asm statement).  */
>  ...
> 
> Looks like I have to ignore the force_output flag as well in ipa_pta_execute
> for this sort of node.

It rather looks like the flag shouldn't be set.  The fn after all has
its address taken!(?)

Richard.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-12-03  8:59                   ` Richard Biener
@ 2015-12-03 11:09                     ` Tom de Vries
  2015-12-03 11:13                       ` Jakub Jelinek
                                         ` (2 more replies)
  0 siblings, 3 replies; 44+ messages in thread
From: Tom de Vries @ 2015-12-03 11:09 UTC (permalink / raw)
  To: Richard Biener; +Cc: Thomas Schwinge, Jakub Jelinek, gcc-patches

On 03/12/15 09:59, Richard Biener wrote:
> On Thu, 3 Dec 2015, Tom de Vries wrote:
>
>> On 03/12/15 01:10, Tom de Vries wrote:
>>>
>>> I've managed to reproduce it. The difference between pass and fail is
>>> whether the compiler is configured with or without accelerator.
>>>
>>> I'll look into it.
>>
>> In the configuration with accelerator, the flag node->force_output is on for
>> foo._omp.fn.
>>
>> This causes nonlocal_p to be true in ipa_pta_execute, which causes the
>> optimization to fail.
>>
>> The flag is decribed as:
>> ...
>>    /* The symbol will be assumed to be used in an invisible way (like
>>       by an toplevel asm statement).  */
>>   ...
>>
>> Looks like I have to ignore the force_output flag as well in ipa_pta_execute
>> for this sort of node.
>
> It rather looks like the flag shouldn't be set.  The fn after all has
> its address taken!(?)
>

The flag is set here in expand_omp_target:
...
12682         /* Prevent IPA from removing child_fn as unreachable,
                  since there are no
12683            refs from the parent function to child_fn in offload
                  LTO mode.  */
12684         if (ENABLE_OFFLOADING)
12685           cgraph_node::get (child_fn)->mark_force_output ();
...

I guess setting forced_by_abi instead would also mean child_fn is not 
removed as unreachable, while still allowing optimizations:
...
   /* Like FORCE_OUTPUT, but in the case it is ABI requiring the symbol
      to be exported.  Unlike FORCE_OUTPUT this flag gets cleared to
      symbols promoted to static and it does not inhibit
      optimization.  */
   unsigned forced_by_abi : 1;
...

But I suspect that other optimizations (than ipa-pta) might break things.

Essentially we have two situations:
- in the host compiler, there is no need for the forced_output flag,
   and it inhibits optimization
- in the accelerator compiler, it (or some equivalent) is needed

I wonder if setting the force_output flag only when streaming the 
bytecode for offloading would work. That way, it wouldn't be set in the 
host compiler, while being set in the accelerator compiler.

Thanks,
- Tom

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-12-03 11:09                     ` Tom de Vries
  2015-12-03 11:13                       ` Jakub Jelinek
@ 2015-12-03 11:13                       ` Richard Biener
  2015-12-10 13:15                       ` Tom de Vries
  2 siblings, 0 replies; 44+ messages in thread
From: Richard Biener @ 2015-12-03 11:13 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Thomas Schwinge, Jakub Jelinek, gcc-patches

On Thu, 3 Dec 2015, Tom de Vries wrote:

> On 03/12/15 09:59, Richard Biener wrote:
> > On Thu, 3 Dec 2015, Tom de Vries wrote:
> > 
> > > On 03/12/15 01:10, Tom de Vries wrote:
> > > > 
> > > > I've managed to reproduce it. The difference between pass and fail is
> > > > whether the compiler is configured with or without accelerator.
> > > > 
> > > > I'll look into it.
> > > 
> > > In the configuration with accelerator, the flag node->force_output is on
> > > for
> > > foo._omp.fn.
> > > 
> > > This causes nonlocal_p to be true in ipa_pta_execute, which causes the
> > > optimization to fail.
> > > 
> > > The flag is decribed as:
> > > ...
> > >    /* The symbol will be assumed to be used in an invisible way (like
> > >       by an toplevel asm statement).  */
> > >   ...
> > > 
> > > Looks like I have to ignore the force_output flag as well in
> > > ipa_pta_execute
> > > for this sort of node.
> > 
> > It rather looks like the flag shouldn't be set.  The fn after all has
> > its address taken!(?)
> > 
> 
> The flag is set here in expand_omp_target:
> ...
> 12682         /* Prevent IPA from removing child_fn as unreachable,
>                  since there are no
> 12683            refs from the parent function to child_fn in offload
>                  LTO mode.  */
> 12684         if (ENABLE_OFFLOADING)
> 12685           cgraph_node::get (child_fn)->mark_force_output ();
> ...
> 

How are there no refs from the "parent"?  Are there not refs from
some kind of descriptor that maps fallback CPU and offloaded variants?

I think the above needs sorting out in somw way, making the refs
explicit rather than implicit via force_output.

> I guess setting forced_by_abi instead would also mean child_fn is not removed
> as unreachable, while still allowing optimizations:
> ...
>   /* Like FORCE_OUTPUT, but in the case it is ABI requiring the symbol
>      to be exported.  Unlike FORCE_OUTPUT this flag gets cleared to
>      symbols promoted to static and it does not inhibit
>      optimization.  */
>   unsigned forced_by_abi : 1;
> ...
> 
> But I suspect that other optimizations (than ipa-pta) might break things.

How so?

> Essentially we have two situations:
> - in the host compiler, there is no need for the forced_output flag,
>   and it inhibits optimization
> - in the accelerator compiler, it (or some equivalent) is needed
> 
> I wonder if setting the force_output flag only when streaming the bytecode for
> offloading would work. That way, it wouldn't be set in the host compiler,
> while being set in the accelerator compiler.

Yeah, that was my original thinking btw.

Richard.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-12-03 11:09                     ` Tom de Vries
@ 2015-12-03 11:13                       ` Jakub Jelinek
  2015-12-03 11:13                       ` Richard Biener
  2015-12-10 13:15                       ` Tom de Vries
  2 siblings, 0 replies; 44+ messages in thread
From: Jakub Jelinek @ 2015-12-03 11:13 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Richard Biener, Thomas Schwinge, gcc-patches

On Thu, Dec 03, 2015 at 12:09:04PM +0100, Tom de Vries wrote:
> The flag is set here in expand_omp_target:
> ...
> 12682         /* Prevent IPA from removing child_fn as unreachable,
>                  since there are no
> 12683            refs from the parent function to child_fn in offload
>                  LTO mode.  */
> 12684         if (ENABLE_OFFLOADING)
> 12685           cgraph_node::get (child_fn)->mark_force_output ();
> ...
> 
> I guess setting forced_by_abi instead would also mean child_fn is not
> removed as unreachable, while still allowing optimizations:
> ...
>   /* Like FORCE_OUTPUT, but in the case it is ABI requiring the symbol
>      to be exported.  Unlike FORCE_OUTPUT this flag gets cleared to
>      symbols promoted to static and it does not inhibit
>      optimization.  */
>   unsigned forced_by_abi : 1;
> ...
> 
> But I suspect that other optimizations (than ipa-pta) might break things.
> 
> Essentially we have two situations:
> - in the host compiler, there is no need for the forced_output flag,
>   and it inhibits optimization
> - in the accelerator compiler, it (or some equivalent) is needed
> 
> I wonder if setting the force_output flag only when streaming the bytecode
> for offloading would work. That way, it wouldn't be set in the host
> compiler, while being set in the accelerator compiler.

I believe that the host and offload func (and var) tables need to be in
sync, so there needs to be something both in the host and accel compilers
that prevents the functions and variables that have their accel or host
counterpart in the tables from being optimized away, or say replaced by
a clone with different arguments etc.

	Jakub

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta
  2015-11-30 13:35       ` Jakub Jelinek
@ 2015-12-03 11:49         ` Tom de Vries
  0 siblings, 0 replies; 44+ messages in thread
From: Tom de Vries @ 2015-12-03 11:49 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Biener, gcc-patches

On 30/11/15 14:32, Jakub Jelinek wrote:
> On Mon, Nov 30, 2015 at 02:24:18PM +0100, Richard Biener wrote:
>>> OK for stage3 trunk if bootstrap and reg-test succeeds?
>>
>> -                        || node->address_taken);
>> +                        || (node->address_taken
>> +                            && !node->parallelized_function));
>>
>> please add a comment here on why this is safe.
>>
>> Ok with this change.
>
> BTW, __builting_GOMP_task supposedly can be treated similarly
> if the third argument is NULL (if 3rd arg is non-NULL, then
> the caller passes a different structure from what the callee receives,
> but perhaps it could be emulated as pretending that cpyfn is called first
> with address of a temporary var and the data argument and then fn
> is called with the address of the temporary var).

Filed as PR68673 - Handle __builtin_GOMP_task optimally in ipa-pta.

Can you provide testcases for both (3rd arg NULL/non-NULL) cases? I'm 
not fluent in openmp.

Thanks,
- Tom

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [PATCH, PR68716] Fix GOMP/GOACC_parallel handling in find_func_clobbers
  2015-11-30 12:20   ` Tom de Vries
  2015-11-30 13:32     ` Richard Biener
@ 2015-12-09 10:01     ` Tom de Vries
  2015-12-09 10:04       ` Jakub Jelinek
  2015-12-10  9:01       ` [committed, obvious] Remove invalid assert in find_func_aliases_for_builtin_call Tom de Vries
  1 sibling, 2 replies; 44+ messages in thread
From: Tom de Vries @ 2015-12-09 10:01 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jakub Jelinek, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 4018 bytes --]

[ was: Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta ]

On 30/11/15 13:11, Tom de Vries wrote:
> On 30/11/15 10:16, Richard Biener wrote:
>> On Mon, 30 Nov 2015, Tom de Vries wrote:
>>
>>> Hi,
>>>
>>> this patch fixes PR46032.
>>>
>>> It handles a call:
>>> ...
>>>    __builtin_GOMP_parallel (fn, data, num_threads, flags)
>>> ...
>>> as:
>>> ...
>>>    fn (data)
>>> ...
>>> in ipa-pta.
>>>
>>> This improves ipa-pta alias analysis in the parallelized function fn,
>>> and
>>> allows vectorization in the testcase without a runtime alias test.
>>>
>>> Bootstrapped and reg-tested on x86_64.
>>>
>>> OK for stage3 trunk?
>>
>> +             /* Assign the passed argument to the appropriate incoming
>> +                parameter of the function.  */
>> +             struct constraint_expr lhs ;
>> +             lhs = get_function_part_constraint (fi, fi_parm_base + 0);
>> +             auto_vec<ce_s, 2> rhsc;
>> +             struct constraint_expr *rhsp;
>> +             get_constraint_for_rhs (arg, &rhsc);
>> +             while (rhsc.length () != 0)
>> +               {
>> +                 rhsp = &rhsc.last ();
>> +                 process_constraint (new_constraint (lhs, *rhsp));
>> +                 rhsc.pop ();
>> +               }
>>
>> please use style used elsewhere with
>>
>>               FOR_EACH_VEC_ELT (rhsc, j, rhsp)
>>                 process_constraint (new_constraint (lhs, *rhsp));
>>               rhsc.truncate (0);
>>
>
> That code was copied from find_func_aliases_for_call.
> I've factored out the bit that I copied as
> find_func_aliases_for_call_arg, and fixed the style there (and dropped
> 'rhsc.truncate (0)' since AFAIU it's redundant at the end of a function).
>
>> +             /* Parameter passed by value is used.  */
>> +             lhs = get_function_part_constraint (fi, fi_uses);
>> +             struct constraint_expr *rhsp;
>> +             get_constraint_for_address_of (arg, &rhsc);
>>
>> This isn't correct - you want to use get_constraint_for (arg, &rhsc).
>> After all rhs is already an ADDR_EXPR.
>>
>
> Can we add an assert somewhere to detect this incorrect usage?
>
>> +             FOR_EACH_VEC_ELT (rhsc, j, rhsp)
>> +               process_constraint (new_constraint (lhs, *rhsp));
>> +             rhsc.truncate (0);
>> +
>> +             /* The caller clobbers what the callee does.  */
>> +             lhs = get_function_part_constraint (fi, fi_clobbers);
>> +             rhs = get_function_part_constraint (cfi, fi_clobbers);
>> +             process_constraint (new_constraint (lhs, rhs));
>> +
>> +             /* The caller uses what the callee does.  */
>> +             lhs = get_function_part_constraint (fi, fi_uses);
>> +             rhs = get_function_part_constraint (cfi, fi_uses);
>> +             process_constraint (new_constraint (lhs, rhs));
>>
>> I don't see why you need those.  The solver should compute these
>> in even better precision (context sensitive on the call side).
>>
>> The same is true for the function parameter.  That is, the only
>> needed part of the patch should be that making sure we see
>> the "direct" call and assign parameters correctly.
>>
>
> Dropped this bit.

Dropping the find_func_clobbers bit (in particular, the part copying the 
clobbers) caused PR68716.

Basically the find_func_clobbers bit tries to emulate the generic 
handling of calls in find_func_clobbers, but pretends the call is
   fn (data)
when handling
   __builtin_GOMP_parallel (fn, data, num_threads, flags)
.

I used the same comments as in the generic call handling to make it 
clear what part of the generic call handling is emulated.

Compared to the originally posted patch, the changes are:
- removed 'gcc_assert (TREE_CODE (arg) == ADDR_EXPR)'. Ran into a case
   where arg is NULL.
- use get_constraint_for instead of get_constraint_for_address_of as
   requested in a review comment above.
- handle GOACC_parallel as well

Bootstrapped and reg-tested on x86_64.

OK for stage3 trunk?

Thanks,
- Tom


[-- Attachment #2: 0001-Fix-GOMP-GOACC_parallel-handling-in-find_func_clobbers.patch --]
[-- Type: text/x-patch, Size: 2819 bytes --]

Fix GOMP/GOACC_parallel handling in find_func_clobbers

2015-12-08  Tom de Vries  <tom@codesourcery.com>

	PR tree-optimization/68716
	* tree-ssa-structalias.c (find_func_clobbers): Fix handling of
	BUILT_IN_GOMP_PARALLEL and BUILT_IN_GOMP_PARALLEL.

	* testsuite/libgomp.c/omp-nested-2.c: New test.

---
 gcc/tree-ssa-structalias.c                 | 47 +++++++++++++++++++++++++++++-
 libgomp/testsuite/libgomp.c/omp-nested-2.c |  4 +++
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-ssa-structalias.c b/gcc/tree-ssa-structalias.c
index 060ff3e..15e351e 100644
--- a/gcc/tree-ssa-structalias.c
+++ b/gcc/tree-ssa-structalias.c
@@ -5082,7 +5082,52 @@ find_func_clobbers (struct function *fn, gimple *origt)
 	    return;
 	  case BUILT_IN_GOMP_PARALLEL:
 	  case BUILT_IN_GOACC_PARALLEL:
-	    return;
+	    {
+	      unsigned int fnpos, argpos;
+	      switch (DECL_FUNCTION_CODE (decl))
+		{
+		case BUILT_IN_GOMP_PARALLEL:
+		  /* __builtin_GOMP_parallel (fn, data, num_threads, flags).  */
+		  fnpos = 0;
+		  argpos = 1;
+		  break;
+		case BUILT_IN_GOACC_PARALLEL:
+		  /* __builtin_GOACC_parallel (device, fn, mapnum, hostaddrs,
+					       sizes, kinds, ...).  */
+		  fnpos = 1;
+		  argpos = 3;
+		  break;
+		default:
+		  gcc_unreachable ();
+		}
+
+	      tree fnarg = gimple_call_arg (t, fnpos);
+	      gcc_assert (TREE_CODE (fnarg) == ADDR_EXPR);
+	      tree fndecl = TREE_OPERAND (fnarg, 0);
+	      varinfo_t cfi = get_vi_for_tree (fndecl);
+
+	      tree arg = gimple_call_arg (t, argpos);
+
+	      /* Parameter passed by value is used.  */
+	      lhs = get_function_part_constraint (fi, fi_uses);
+	      struct constraint_expr *rhsp;
+	      get_constraint_for (arg, &rhsc);
+	      FOR_EACH_VEC_ELT (rhsc, j, rhsp)
+		process_constraint (new_constraint (lhs, *rhsp));
+	      rhsc.truncate (0);
+
+	      /* The caller clobbers what the callee does.  */
+	      lhs = get_function_part_constraint (fi, fi_clobbers);
+	      rhs = get_function_part_constraint (cfi, fi_clobbers);
+	      process_constraint (new_constraint (lhs, rhs));
+
+	      /* The caller uses what the callee does.  */
+	      lhs = get_function_part_constraint (fi, fi_uses);
+	      rhs = get_function_part_constraint (cfi, fi_uses);
+	      process_constraint (new_constraint (lhs, rhs));
+
+	      return;
+	    }
 	  /* printf-style functions may have hooks to set pointers to
 	     point to somewhere into the generated string.  Leave them
 	     for a later exercise...  */
diff --git a/libgomp/testsuite/libgomp.c/omp-nested-2.c b/libgomp/testsuite/libgomp.c/omp-nested-2.c
new file mode 100644
index 0000000..7495afb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/omp-nested-2.c
@@ -0,0 +1,4 @@
+// { dg-do run }
+// { dg-additional-options "-fipa-pta" }
+
+#include "omp-nested-1.c"

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH, PR68716] Fix GOMP/GOACC_parallel handling in find_func_clobbers
  2015-12-09 10:01     ` [PATCH, PR68716] Fix GOMP/GOACC_parallel handling in find_func_clobbers Tom de Vries
@ 2015-12-09 10:04       ` Jakub Jelinek
  2015-12-09 10:09         ` Richard Biener
  2015-12-09 10:11         ` Tom de Vries
  2015-12-10  9:01       ` [committed, obvious] Remove invalid assert in find_func_aliases_for_builtin_call Tom de Vries
  1 sibling, 2 replies; 44+ messages in thread
From: Jakub Jelinek @ 2015-12-09 10:04 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Richard Biener, gcc-patches

On Wed, Dec 09, 2015 at 11:01:31AM +0100, Tom de Vries wrote:
> 	PR tree-optimization/68716
> 	* tree-ssa-structalias.c (find_func_clobbers): Fix handling of
> 	BUILT_IN_GOMP_PARALLEL and BUILT_IN_GOMP_PARALLEL.

Pasto in ChangeLog entry?

	Jakub

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH, PR68716] Fix GOMP/GOACC_parallel handling in find_func_clobbers
  2015-12-09 10:04       ` Jakub Jelinek
@ 2015-12-09 10:09         ` Richard Biener
  2015-12-09 10:11         ` Tom de Vries
  1 sibling, 0 replies; 44+ messages in thread
From: Richard Biener @ 2015-12-09 10:09 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Tom de Vries, gcc-patches

On Wed, 9 Dec 2015, Jakub Jelinek wrote:

> On Wed, Dec 09, 2015 at 11:01:31AM +0100, Tom de Vries wrote:
> > 	PR tree-optimization/68716
> > 	* tree-ssa-structalias.c (find_func_clobbers): Fix handling of
> > 	BUILT_IN_GOMP_PARALLEL and BUILT_IN_GOMP_PARALLEL.
> 
> Pasto in ChangeLog entry?

Ok with that fixed.

Richard.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH, PR68716] Fix GOMP/GOACC_parallel handling in find_func_clobbers
  2015-12-09 10:04       ` Jakub Jelinek
  2015-12-09 10:09         ` Richard Biener
@ 2015-12-09 10:11         ` Tom de Vries
  1 sibling, 0 replies; 44+ messages in thread
From: Tom de Vries @ 2015-12-09 10:11 UTC (permalink / raw)
  To: Jakub Jelinek; +Cc: Richard Biener, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 327 bytes --]

On 09/12/15 11:03, Jakub Jelinek wrote:
> On Wed, Dec 09, 2015 at 11:01:31AM +0100, Tom de Vries wrote:
>> 	PR tree-optimization/68716
>> 	* tree-ssa-structalias.c (find_func_clobbers): Fix handling of
>> 	BUILT_IN_GOMP_PARALLEL and BUILT_IN_GOMP_PARALLEL.
>
> Pasto in ChangeLog entry?
>

Indeed, thanks for noticing.

- Tom


[-- Attachment #2: 0001-Fix-GOMP-GOACC_parallel-handling-in-find_func_clobbers.patch --]
[-- Type: text/x-patch, Size: 2820 bytes --]

Fix GOMP/GOACC_parallel handling in find_func_clobbers

2015-12-08  Tom de Vries  <tom@codesourcery.com>

	PR tree-optimization/68716
	* tree-ssa-structalias.c (find_func_clobbers): Fix handling of
	BUILT_IN_GOMP_PARALLEL and BUILT_IN_GOACC_PARALLEL.

	* testsuite/libgomp.c/omp-nested-2.c: New test.

---
 gcc/tree-ssa-structalias.c                 | 47 +++++++++++++++++++++++++++++-
 libgomp/testsuite/libgomp.c/omp-nested-2.c |  4 +++
 2 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/gcc/tree-ssa-structalias.c b/gcc/tree-ssa-structalias.c
index 060ff3e..15e351e 100644
--- a/gcc/tree-ssa-structalias.c
+++ b/gcc/tree-ssa-structalias.c
@@ -5082,7 +5082,52 @@ find_func_clobbers (struct function *fn, gimple *origt)
 	    return;
 	  case BUILT_IN_GOMP_PARALLEL:
 	  case BUILT_IN_GOACC_PARALLEL:
-	    return;
+	    {
+	      unsigned int fnpos, argpos;
+	      switch (DECL_FUNCTION_CODE (decl))
+		{
+		case BUILT_IN_GOMP_PARALLEL:
+		  /* __builtin_GOMP_parallel (fn, data, num_threads, flags).  */
+		  fnpos = 0;
+		  argpos = 1;
+		  break;
+		case BUILT_IN_GOACC_PARALLEL:
+		  /* __builtin_GOACC_parallel (device, fn, mapnum, hostaddrs,
+					       sizes, kinds, ...).  */
+		  fnpos = 1;
+		  argpos = 3;
+		  break;
+		default:
+		  gcc_unreachable ();
+		}
+
+	      tree fnarg = gimple_call_arg (t, fnpos);
+	      gcc_assert (TREE_CODE (fnarg) == ADDR_EXPR);
+	      tree fndecl = TREE_OPERAND (fnarg, 0);
+	      varinfo_t cfi = get_vi_for_tree (fndecl);
+
+	      tree arg = gimple_call_arg (t, argpos);
+
+	      /* Parameter passed by value is used.  */
+	      lhs = get_function_part_constraint (fi, fi_uses);
+	      struct constraint_expr *rhsp;
+	      get_constraint_for (arg, &rhsc);
+	      FOR_EACH_VEC_ELT (rhsc, j, rhsp)
+		process_constraint (new_constraint (lhs, *rhsp));
+	      rhsc.truncate (0);
+
+	      /* The caller clobbers what the callee does.  */
+	      lhs = get_function_part_constraint (fi, fi_clobbers);
+	      rhs = get_function_part_constraint (cfi, fi_clobbers);
+	      process_constraint (new_constraint (lhs, rhs));
+
+	      /* The caller uses what the callee does.  */
+	      lhs = get_function_part_constraint (fi, fi_uses);
+	      rhs = get_function_part_constraint (cfi, fi_uses);
+	      process_constraint (new_constraint (lhs, rhs));
+
+	      return;
+	    }
 	  /* printf-style functions may have hooks to set pointers to
 	     point to somewhere into the generated string.  Leave them
 	     for a later exercise...  */
diff --git a/libgomp/testsuite/libgomp.c/omp-nested-2.c b/libgomp/testsuite/libgomp.c/omp-nested-2.c
new file mode 100644
index 0000000..7495afb
--- /dev/null
+++ b/libgomp/testsuite/libgomp.c/omp-nested-2.c
@@ -0,0 +1,4 @@
+// { dg-do run }
+// { dg-additional-options "-fipa-pta" }
+
+#include "omp-nested-1.c"

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [committed, obvious] Remove invalid assert in find_func_aliases_for_builtin_call
  2015-12-09 10:01     ` [PATCH, PR68716] Fix GOMP/GOACC_parallel handling in find_func_clobbers Tom de Vries
  2015-12-09 10:04       ` Jakub Jelinek
@ 2015-12-10  9:01       ` Tom de Vries
  1 sibling, 0 replies; 44+ messages in thread
From: Tom de Vries @ 2015-12-10  9:01 UTC (permalink / raw)
  To: Richard Biener; +Cc: Jakub Jelinek, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 4188 bytes --]

[ was : Re: [PATCH, PR68716] Fix GOMP/GOACC_parallel handling in 
find_func_clobbers ]

On 09/12/15 11:01, Tom de Vries wrote:
> [ was: Re: [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta ]
>
> On 30/11/15 13:11, Tom de Vries wrote:
>> On 30/11/15 10:16, Richard Biener wrote:
>>> On Mon, 30 Nov 2015, Tom de Vries wrote:
>>>
>>>> Hi,
>>>>
>>>> this patch fixes PR46032.
>>>>
>>>> It handles a call:
>>>> ...
>>>>    __builtin_GOMP_parallel (fn, data, num_threads, flags)
>>>> ...
>>>> as:
>>>> ...
>>>>    fn (data)
>>>> ...
>>>> in ipa-pta.
>>>>
>>>> This improves ipa-pta alias analysis in the parallelized function fn,
>>>> and
>>>> allows vectorization in the testcase without a runtime alias test.
>>>>
>>>> Bootstrapped and reg-tested on x86_64.
>>>>
>>>> OK for stage3 trunk?
>>>
>>> +             /* Assign the passed argument to the appropriate incoming
>>> +                parameter of the function.  */
>>> +             struct constraint_expr lhs ;
>>> +             lhs = get_function_part_constraint (fi, fi_parm_base + 0);
>>> +             auto_vec<ce_s, 2> rhsc;
>>> +             struct constraint_expr *rhsp;
>>> +             get_constraint_for_rhs (arg, &rhsc);
>>> +             while (rhsc.length () != 0)
>>> +               {
>>> +                 rhsp = &rhsc.last ();
>>> +                 process_constraint (new_constraint (lhs, *rhsp));
>>> +                 rhsc.pop ();
>>> +               }
>>>
>>> please use style used elsewhere with
>>>
>>>               FOR_EACH_VEC_ELT (rhsc, j, rhsp)
>>>                 process_constraint (new_constraint (lhs, *rhsp));
>>>               rhsc.truncate (0);
>>>
>>
>> That code was copied from find_func_aliases_for_call.
>> I've factored out the bit that I copied as
>> find_func_aliases_for_call_arg, and fixed the style there (and dropped
>> 'rhsc.truncate (0)' since AFAIU it's redundant at the end of a function).
>>
>>> +             /* Parameter passed by value is used.  */
>>> +             lhs = get_function_part_constraint (fi, fi_uses);
>>> +             struct constraint_expr *rhsp;
>>> +             get_constraint_for_address_of (arg, &rhsc);
>>>
>>> This isn't correct - you want to use get_constraint_for (arg, &rhsc).
>>> After all rhs is already an ADDR_EXPR.
>>>
>>
>> Can we add an assert somewhere to detect this incorrect usage?
>>
>>> +             FOR_EACH_VEC_ELT (rhsc, j, rhsp)
>>> +               process_constraint (new_constraint (lhs, *rhsp));
>>> +             rhsc.truncate (0);
>>> +
>>> +             /* The caller clobbers what the callee does.  */
>>> +             lhs = get_function_part_constraint (fi, fi_clobbers);
>>> +             rhs = get_function_part_constraint (cfi, fi_clobbers);
>>> +             process_constraint (new_constraint (lhs, rhs));
>>> +
>>> +             /* The caller uses what the callee does.  */
>>> +             lhs = get_function_part_constraint (fi, fi_uses);
>>> +             rhs = get_function_part_constraint (cfi, fi_uses);
>>> +             process_constraint (new_constraint (lhs, rhs));
>>>
>>> I don't see why you need those.  The solver should compute these
>>> in even better precision (context sensitive on the call side).
>>>
>>> The same is true for the function parameter.  That is, the only
>>> needed part of the patch should be that making sure we see
>>> the "direct" call and assign parameters correctly.
>>>
>>
>> Dropped this bit.
>
> Dropping the find_func_clobbers bit (in particular, the part copying the
> clobbers) caused PR68716.
>
> Basically the find_func_clobbers bit tries to emulate the generic
> handling of calls in find_func_clobbers, but pretends the call is
>    fn (data)
> when handling
>    __builtin_GOMP_parallel (fn, data, num_threads, flags)
> .
>
> I used the same comments as in the generic call handling to make it
> clear what part of the generic call handling is emulated.
>
> Compared to the originally posted patch, the changes are:
> - removed 'gcc_assert (TREE_CODE (arg) == ADDR_EXPR)'. Ran into a case
>    where arg is NULL.

And for that same reason, I've removed the same assert from 
find_func_aliases_for_builtin_call.

Committed to trunk as obvious.

Thanks,
- Tom

[-- Attachment #2: 0001-Remove-invalid-assert-in-find_func_aliases_for_builtin_call.patch --]
[-- Type: text/x-patch, Size: 827 bytes --]

Remove invalid assert in find_func_aliases_for_builtin_call

2015-12-10  Tom de Vries  <tom@codesourcery.com>

	* tree-ssa-structalias.c (find_func_aliases_for_builtin_call): Remove
	invalid assert.

---
 gcc/tree-ssa-structalias.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gcc/tree-ssa-structalias.c b/gcc/tree-ssa-structalias.c
index 15e351e..c350862 100644
--- a/gcc/tree-ssa-structalias.c
+++ b/gcc/tree-ssa-structalias.c
@@ -4533,7 +4533,6 @@ find_func_aliases_for_builtin_call (struct function *fn, gcall *t)
 	      gcc_assert (TREE_CODE (fnarg) == ADDR_EXPR);
 	      tree fndecl = TREE_OPERAND (fnarg, 0);
 	      tree arg = gimple_call_arg (t, argpos);
-	      gcc_assert (TREE_CODE (arg) == ADDR_EXPR);
 
 	      varinfo_t fi = get_vi_for_tree (fndecl);
 	      find_func_aliases_for_call_arg (fi, 0, arg);

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-12-03 11:09                     ` Tom de Vries
  2015-12-03 11:13                       ` Jakub Jelinek
  2015-12-03 11:13                       ` Richard Biener
@ 2015-12-10 13:15                       ` Tom de Vries
  2015-12-16 16:02                         ` Tom de Vries
  2 siblings, 1 reply; 44+ messages in thread
From: Tom de Vries @ 2015-12-10 13:15 UTC (permalink / raw)
  To: Richard Biener; +Cc: Thomas Schwinge, Jakub Jelinek, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2958 bytes --]

[ copy-pasting-with-quote from 
https://gcc.gnu.org/ml/gcc-patches/2015-12/msg00420.html , for some 
reason I didn't get this email ]

> On Thu, 3 Dec 2015, Tom de Vries wrote:
>> The flag is set here in expand_omp_target:
>> ...
>> 12682         /* Prevent IPA from removing child_fn as unreachable,
>>                  since there are no
>> 12683            refs from the parent function to child_fn in offload
>>                  LTO mode.  */
>> 12684         if (ENABLE_OFFLOADING)
>> 12685           cgraph_node::get (child_fn)->mark_force_output ();
>> ...
>>
>
> How are there no refs from the "parent"?  Are there not refs from
> some kind of descriptor that maps fallback CPU and offloaded variants?

That descriptor is the offload table, which is emitted in 
omp_finish_file. The function iterates over vectors offload_vars and 
offload_funcs.

[ I would guess there's a one-on-one correspondance between 
symtab_node::offloadable and membership of either offload_vars or 
offload_funcs. ]

> I think the above needs sorting out in somw way, making the refs
> explicit rather than implicit via force_output.

I've tried an approach where I add a test for node->offloadable next to 
each test for node->force_output, except for the test in the nonlocal_p 
def in ipa_pta_execute. But I didn't (yet) manage to make that work.

>> I guess setting forced_by_abi instead would also mean child_fn is not removed
>> as unreachable, while still allowing optimizations:
>> ...
>>   /* Like FORCE_OUTPUT, but in the case it is ABI requiring the symbol
>>      to be exported.  Unlike FORCE_OUTPUT this flag gets cleared to
>>      symbols promoted to static and it does not inhibit
>>      optimization.  */
>>   unsigned forced_by_abi : 1;
>> ...
>>
>> But I suspect that other optimizations (than ipa-pta) might break things.
>
> How so?

Probably it's more accurate to say that I do not understand the 
difference very well between force_output and force_by_abi, and what is 
the class of optimizations enabled by using forced_by_abi instead of 
force_output.'

>> Essentially we have two situations:
>> - in the host compiler, there is no need for the forced_output flag,
>>   and it inhibits optimization
>> - in the accelerator compiler, it (or some equivalent) is needed

Actually, things are slightly more complicated, I realize now. There's 
also the distinction between:
- symbols declared as offloadable in the source code, and
- symbols create by the compiler and marked offloadable

>> I wonder if setting the force_output flag only when streaming the bytecode for
>> offloading would work. That way, it wouldn't be set in the host compiler,
>> while being set in the accelerator compiler.
>
> Yeah, that was my original thinking btw.

FTR, I've tried that approach, as attached. It fixed the 
goacc/kernels-alias-ipa-pta*.c failures. And I ran target-libgomp (also 
using an accelerator configuration) without any regressions.

Thanks,
- Tom


[-- Attachment #2: 0002-Set-force_output-in-offload-stream-if-offloadable.patch --]
[-- Type: text/x-patch, Size: 1456 bytes --]

Set force_output in offload stream if offloadable

---
 gcc/lto-cgraph.c | 2 +-
 gcc/omp-low.c    | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
index 62e5454..c862b19 100644
--- a/gcc/lto-cgraph.c
+++ b/gcc/lto-cgraph.c
@@ -511,7 +511,7 @@ lto_output_node (struct lto_simple_output_block *ob, struct cgraph_node *node,
   bp_pack_value (&bp, node->local.versionable, 1);
   bp_pack_value (&bp, node->local.can_change_signature, 1);
   bp_pack_value (&bp, node->local.redefined_extern_inline, 1);
-  bp_pack_value (&bp, node->force_output, 1);
+  bp_pack_value (&bp, node->force_output || node->offloadable, 1);
   bp_pack_value (&bp, node->forced_by_abi, 1);
   bp_pack_value (&bp, node->unique_name, 1);
   bp_pack_value (&bp, node->body_removed, 1);
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index 5643480..569cfd7 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -12770,10 +12770,12 @@ expand_omp_target (struct omp_region *region)
 	assign_assembler_name_if_neeeded (child_fn);
       cgraph_edge::rebuild_edges ();
 
+#if 0
       /* Prevent IPA from removing child_fn as unreachable, since there are no
 	 refs from the parent function to child_fn in offload LTO mode.  */
       if (ENABLE_OFFLOADING)
 	cgraph_node::get (child_fn)->mark_force_output ();
+#endif
 
       /* Some EH regions might become dead, see PR34608.  If
 	 pass_cleanup_cfg isn't the first pass to happen with the

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-12-10 13:15                       ` Tom de Vries
@ 2015-12-16 16:02                         ` Tom de Vries
  2016-01-05 14:56                           ` [PING][PATCH] Mark symbols in offload tables with force_output in read_offload_tables Tom de Vries
  2016-01-08  8:17                           ` [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta Richard Biener
  0 siblings, 2 replies; 44+ messages in thread
From: Tom de Vries @ 2015-12-16 16:02 UTC (permalink / raw)
  To: Richard Biener; +Cc: Thomas Schwinge, Jakub Jelinek, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 3447 bytes --]

On 10/12/15 14:14, Tom de Vries wrote:
> [ copy-pasting-with-quote from
> https://gcc.gnu.org/ml/gcc-patches/2015-12/msg00420.html , for some
> reason I didn't get this email ]
>
>> On Thu, 3 Dec 2015, Tom de Vries wrote:
>>> The flag is set here in expand_omp_target:
>>> ...
>>> 12682         /* Prevent IPA from removing child_fn as unreachable,
>>>                  since there are no
>>> 12683            refs from the parent function to child_fn in offload
>>>                  LTO mode.  */
>>> 12684         if (ENABLE_OFFLOADING)
>>> 12685           cgraph_node::get (child_fn)->mark_force_output ();
>>> ...
>>>
>>
>> How are there no refs from the "parent"?  Are there not refs from
>> some kind of descriptor that maps fallback CPU and offloaded variants?
>
> That descriptor is the offload table, which is emitted in
> omp_finish_file. The function iterates over vectors offload_vars and
> offload_funcs.
>
> [ I would guess there's a one-on-one correspondance between
> symtab_node::offloadable and membership of either offload_vars or
> offload_funcs. ]
>
>> I think the above needs sorting out in somw way, making the refs
>> explicit rather than implicit via force_output.
>
> I've tried an approach where I add a test for node->offloadable next to
> each test for node->force_output, except for the test in the nonlocal_p
> def in ipa_pta_execute. But I didn't (yet) manage to make that work.
>
>>> I guess setting forced_by_abi instead would also mean child_fn is not
>>> removed
>>> as unreachable, while still allowing optimizations:
>>> ...
>>>   /* Like FORCE_OUTPUT, but in the case it is ABI requiring the symbol
>>>      to be exported.  Unlike FORCE_OUTPUT this flag gets cleared to
>>>      symbols promoted to static and it does not inhibit
>>>      optimization.  */
>>>   unsigned forced_by_abi : 1;
>>> ...
>>>
>>> But I suspect that other optimizations (than ipa-pta) might break
>>> things.
>>
>> How so?
>
> Probably it's more accurate to say that I do not understand the
> difference very well between force_output and force_by_abi, and what is
> the class of optimizations enabled by using forced_by_abi instead of
> force_output.'
>
>>> Essentially we have two situations:
>>> - in the host compiler, there is no need for the forced_output flag,
>>>   and it inhibits optimization
>>> - in the accelerator compiler, it (or some equivalent) is needed
>
> Actually, things are slightly more complicated, I realize now. There's
> also the distinction between:
> - symbols declared as offloadable in the source code, and
> - symbols create by the compiler and marked offloadable
>
>>> I wonder if setting the force_output flag only when streaming the
>>> bytecode for
>>> offloading would work. That way, it wouldn't be set in the host
>>> compiler,
>>> while being set in the accelerator compiler.
>>
>> Yeah, that was my original thinking btw.
>
> FTR, I've tried that approach, as attached. It fixed the
> goacc/kernels-alias-ipa-pta*.c failures. And I ran target-libgomp (also
> using an accelerator configuration) without any regressions.

How about this patch?

We remove the setting of force_output when:
- encountering offloadable symbols in the frontend, or
- creating offloadable symbols in expand-omp.

Instead, we set force_output in input_offload_tables.

This is an improvement because:
- it moves the force_output setting to a single location
- it does the force_output setting ALAP

Thanks,
- Tom

[-- Attachment #2: 0008-Mark-symbols-in-offload-tables-with-force_output-in-read_offload_tables.patch --]
[-- Type: text/x-patch, Size: 4197 bytes --]

Mark symbols in offload tables with force_output in read_offload_tables

2015-12-15  Tom de Vries  <tom@codesourcery.com>

	* c-parser.c (c_parser_oacc_declare, c_parser_omp_declare_target): Don't
	set force_output.

	* parser.c (cp_parser_oacc_declare, cp_parser_omp_declare_target): Don't
	set force_output.

	* omp-low.c (expand_omp_target): Don't set force_output.
	* varpool.c (varpool_node::get_create): Same.
	* lto-cgraph.c (input_offload_tables): Mark entries in offload_vars and
	offload_funcs with force_output.

---
 gcc/c/c-parser.c | 10 ++--------
 gcc/cp/parser.c  | 10 ++--------
 gcc/lto-cgraph.c |  9 +++++++++
 gcc/omp-low.c    |  5 -----
 gcc/varpool.c    |  1 -
 5 files changed, 13 insertions(+), 22 deletions(-)

diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index 124c30b..6e6f4b8 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -13527,10 +13527,7 @@ c_parser_oacc_declare (c_parser *parser)
 		    {
 		      g->have_offload = true;
 		      if (is_a <varpool_node *> (node))
-			{
-			  vec_safe_push (offload_vars, decl);
-			  node->force_output = 1;
-			}
+			vec_safe_push (offload_vars, decl);
 		    }
 		}
 	    }
@@ -16412,10 +16409,7 @@ c_parser_omp_declare_target (c_parser *parser)
 		{
 		  g->have_offload = true;
 		  if (is_a <varpool_node *> (node))
-		    {
-		      vec_safe_push (offload_vars, t);
-		      node->force_output = 1;
-		    }
+		    vec_safe_push (offload_vars, t);
 		}
 	    }
 	}
diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
index a420cf1..340cc4a 100644
--- a/gcc/cp/parser.c
+++ b/gcc/cp/parser.c
@@ -35091,10 +35091,7 @@ cp_parser_oacc_declare (cp_parser *parser, cp_token *pragma_tok)
 		    {
 		      g->have_offload = true;
 		      if (is_a <varpool_node *> (node))
-			{
-			  vec_safe_push (offload_vars, decl);
-			  node->force_output = 1;
-			}
+			vec_safe_push (offload_vars, decl);
 		    }
 		}
 	    }
@@ -35631,10 +35628,7 @@ cp_parser_omp_declare_target (cp_parser *parser, cp_token *pragma_tok)
 		{
 		  g->have_offload = true;
 		  if (is_a <varpool_node *> (node))
-		    {
-		      vec_safe_push (offload_vars, t);
-		      node->force_output = 1;
-		    }
+		    vec_safe_push (offload_vars, t);
 		}
 	    }
 	}
diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
index 62e5454..cdaee41 100644
--- a/gcc/lto-cgraph.c
+++ b/gcc/lto-cgraph.c
@@ -1911,6 +1911,11 @@ input_offload_tables (void)
 	      tree fn_decl
 		= lto_file_decl_data_get_fn_decl (file_data, decl_index);
 	      vec_safe_push (offload_funcs, fn_decl);
+
+	      /* Prevent IPA from removing fn_decl as unreachable, since there
+		 may be no refs from the parent function to child_fn in offload
+		 LTO mode.  */
+	      cgraph_node::get (fn_decl)->mark_force_output ();
 	    }
 	  else if (tag == LTO_symtab_variable)
 	    {
@@ -1918,6 +1923,10 @@ input_offload_tables (void)
 	      tree var_decl
 		= lto_file_decl_data_get_var_decl (file_data, decl_index);
 	      vec_safe_push (offload_vars, var_decl);
+
+	      /* Prevent IPA from removing var_decl as unused, since there
+		 may be no refs to var_decl in offload LTO mode.  */
+	      varpool_node::get (var_decl)->force_output = 1;
 	    }
 	  else
 	    fatal_error (input_location,
diff --git a/gcc/omp-low.c b/gcc/omp-low.c
index e1d7c09..a76d8fc 100644
--- a/gcc/omp-low.c
+++ b/gcc/omp-low.c
@@ -12813,11 +12813,6 @@ expand_omp_target (struct omp_region *region)
 	assign_assembler_name_if_neeeded (child_fn);
       cgraph_edge::rebuild_edges ();
 
-      /* Prevent IPA from removing child_fn as unreachable, since there are no
-	 refs from the parent function to child_fn in offload LTO mode.  */
-      if (ENABLE_OFFLOADING)
-	cgraph_node::get (child_fn)->mark_force_output ();
-
       /* Some EH regions might become dead, see PR34608.  If
 	 pass_cleanup_cfg isn't the first pass to happen with the
 	 new child, these dead EH edges might cause problems.
diff --git a/gcc/varpool.c b/gcc/varpool.c
index 5e4fcbf..5ed65e5 100644
--- a/gcc/varpool.c
+++ b/gcc/varpool.c
@@ -158,7 +158,6 @@ varpool_node::get_create (tree decl)
 	  g->have_offload = true;
 	  if (!in_lto_p)
 	    vec_safe_push (offload_vars, decl);
-	  node->force_output = 1;
 	}
     }
 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [PING][PATCH] Mark symbols in offload tables with force_output in read_offload_tables
  2015-12-16 16:02                         ` Tom de Vries
@ 2016-01-05 14:56                           ` Tom de Vries
  2016-01-25 13:27                             ` Ilya Verbin
  2016-01-08  8:17                           ` [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta Richard Biener
  1 sibling, 1 reply; 44+ messages in thread
From: Tom de Vries @ 2016-01-05 14:56 UTC (permalink / raw)
  To: Richard Biener; +Cc: Thomas Schwinge, Jakub Jelinek, gcc-patches

[ was: Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta ]

On 16/12/15 17:02, Tom de Vries wrote:
> On 10/12/15 14:14, Tom de Vries wrote:
>> [ copy-pasting-with-quote from
>> https://gcc.gnu.org/ml/gcc-patches/2015-12/msg00420.html , for some
>> reason I didn't get this email ]
>>
>>> On Thu, 3 Dec 2015, Tom de Vries wrote:
>>>> The flag is set here in expand_omp_target:
>>>> ...
>>>> 12682         /* Prevent IPA from removing child_fn as unreachable,
>>>>                  since there are no
>>>> 12683            refs from the parent function to child_fn in offload
>>>>                  LTO mode.  */
>>>> 12684         if (ENABLE_OFFLOADING)
>>>> 12685           cgraph_node::get (child_fn)->mark_force_output ();
>>>> ...
>>>>
>>>
>>> How are there no refs from the "parent"?  Are there not refs from
>>> some kind of descriptor that maps fallback CPU and offloaded variants?
>>
>> That descriptor is the offload table, which is emitted in
>> omp_finish_file. The function iterates over vectors offload_vars and
>> offload_funcs.
>>
>> [ I would guess there's a one-on-one correspondance between
>> symtab_node::offloadable and membership of either offload_vars or
>> offload_funcs. ]
>>
>>> I think the above needs sorting out in somw way, making the refs
>>> explicit rather than implicit via force_output.
>>
>> I've tried an approach where I add a test for node->offloadable next to
>> each test for node->force_output, except for the test in the nonlocal_p
>> def in ipa_pta_execute. But I didn't (yet) manage to make that work.
>>
>>>> I guess setting forced_by_abi instead would also mean child_fn is not
>>>> removed
>>>> as unreachable, while still allowing optimizations:
>>>> ...
>>>>   /* Like FORCE_OUTPUT, but in the case it is ABI requiring the symbol
>>>>      to be exported.  Unlike FORCE_OUTPUT this flag gets cleared to
>>>>      symbols promoted to static and it does not inhibit
>>>>      optimization.  */
>>>>   unsigned forced_by_abi : 1;
>>>> ...
>>>>
>>>> But I suspect that other optimizations (than ipa-pta) might break
>>>> things.
>>>
>>> How so?
>>
>> Probably it's more accurate to say that I do not understand the
>> difference very well between force_output and force_by_abi, and what is
>> the class of optimizations enabled by using forced_by_abi instead of
>> force_output.'
>>
>>>> Essentially we have two situations:
>>>> - in the host compiler, there is no need for the forced_output flag,
>>>>   and it inhibits optimization
>>>> - in the accelerator compiler, it (or some equivalent) is needed
>>
>> Actually, things are slightly more complicated, I realize now. There's
>> also the distinction between:
>> - symbols declared as offloadable in the source code, and
>> - symbols create by the compiler and marked offloadable
>>
>>>> I wonder if setting the force_output flag only when streaming the
>>>> bytecode for
>>>> offloading would work. That way, it wouldn't be set in the host
>>>> compiler,
>>>> while being set in the accelerator compiler.
>>>
>>> Yeah, that was my original thinking btw.
>>
>> FTR, I've tried that approach, as attached. It fixed the
>> goacc/kernels-alias-ipa-pta*.c failures. And I ran target-libgomp (also
>> using an accelerator configuration) without any regressions.
>
> How about this patch?
>

Ping.

Thanks,
- Tom

> We remove the setting of force_output when:
> - encountering offloadable symbols in the frontend, or
> - creating offloadable symbols in expand-omp.
>
> Instead, we set force_output in input_offload_tables.
>
> This is an improvement because:
> - it moves the force_output setting to a single location
> - it does the force_output setting ALAP
>
> Thanks,
> - Tom
>
> 0008-Mark-symbols-in-offload-tables-with-force_output-in-read_offload_tables.patch
>
>
> Mark symbols in offload tables with force_output in read_offload_tables
>
> 2015-12-15  Tom de Vries<tom@codesourcery.com>
>
> 	* c-parser.c (c_parser_oacc_declare, c_parser_omp_declare_target): Don't
> 	set force_output.
>
> 	* parser.c (cp_parser_oacc_declare, cp_parser_omp_declare_target): Don't
> 	set force_output.
>
> 	* omp-low.c (expand_omp_target): Don't set force_output.
> 	* varpool.c (varpool_node::get_create): Same.
> 	* lto-cgraph.c (input_offload_tables): Mark entries in offload_vars and
> 	offload_funcs with force_output.
>
> ---
>   gcc/c/c-parser.c | 10 ++--------
>   gcc/cp/parser.c  | 10 ++--------
>   gcc/lto-cgraph.c |  9 +++++++++
>   gcc/omp-low.c    |  5 -----
>   gcc/varpool.c    |  1 -
>   5 files changed, 13 insertions(+), 22 deletions(-)
>
> diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
> index 124c30b..6e6f4b8 100644
> --- a/gcc/c/c-parser.c
> +++ b/gcc/c/c-parser.c
> @@ -13527,10 +13527,7 @@ c_parser_oacc_declare (c_parser *parser)
>   		    {
>   		      g->have_offload = true;
>   		      if (is_a <varpool_node *> (node))
> -			{
> -			  vec_safe_push (offload_vars, decl);
> -			  node->force_output = 1;
> -			}
> +			vec_safe_push (offload_vars, decl);
>   		    }
>   		}
>   	    }
> @@ -16412,10 +16409,7 @@ c_parser_omp_declare_target (c_parser *parser)
>   		{
>   		  g->have_offload = true;
>   		  if (is_a <varpool_node *> (node))
> -		    {
> -		      vec_safe_push (offload_vars, t);
> -		      node->force_output = 1;
> -		    }
> +		    vec_safe_push (offload_vars, t);
>   		}
>   	    }
>   	}
> diff --git a/gcc/cp/parser.c b/gcc/cp/parser.c
> index a420cf1..340cc4a 100644
> --- a/gcc/cp/parser.c
> +++ b/gcc/cp/parser.c
> @@ -35091,10 +35091,7 @@ cp_parser_oacc_declare (cp_parser *parser, cp_token *pragma_tok)
>   		    {
>   		      g->have_offload = true;
>   		      if (is_a <varpool_node *> (node))
> -			{
> -			  vec_safe_push (offload_vars, decl);
> -			  node->force_output = 1;
> -			}
> +			vec_safe_push (offload_vars, decl);
>   		    }
>   		}
>   	    }
> @@ -35631,10 +35628,7 @@ cp_parser_omp_declare_target (cp_parser *parser, cp_token *pragma_tok)
>   		{
>   		  g->have_offload = true;
>   		  if (is_a <varpool_node *> (node))
> -		    {
> -		      vec_safe_push (offload_vars, t);
> -		      node->force_output = 1;
> -		    }
> +		    vec_safe_push (offload_vars, t);
>   		}
>   	    }
>   	}
> diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
> index 62e5454..cdaee41 100644
> --- a/gcc/lto-cgraph.c
> +++ b/gcc/lto-cgraph.c
> @@ -1911,6 +1911,11 @@ input_offload_tables (void)
>   	      tree fn_decl
>   		= lto_file_decl_data_get_fn_decl (file_data, decl_index);
>   	      vec_safe_push (offload_funcs, fn_decl);
> +
> +	      /* Prevent IPA from removing fn_decl as unreachable, since there
> +		 may be no refs from the parent function to child_fn in offload
> +		 LTO mode.  */
> +	      cgraph_node::get (fn_decl)->mark_force_output ();
>   	    }
>   	  else if (tag == LTO_symtab_variable)
>   	    {
> @@ -1918,6 +1923,10 @@ input_offload_tables (void)
>   	      tree var_decl
>   		= lto_file_decl_data_get_var_decl (file_data, decl_index);
>   	      vec_safe_push (offload_vars, var_decl);
> +
> +	      /* Prevent IPA from removing var_decl as unused, since there
> +		 may be no refs to var_decl in offload LTO mode.  */
> +	      varpool_node::get (var_decl)->force_output = 1;
>   	    }
>   	  else
>   	    fatal_error (input_location,
> diff --git a/gcc/omp-low.c b/gcc/omp-low.c
> index e1d7c09..a76d8fc 100644
> --- a/gcc/omp-low.c
> +++ b/gcc/omp-low.c
> @@ -12813,11 +12813,6 @@ expand_omp_target (struct omp_region *region)
>   	assign_assembler_name_if_neeeded (child_fn);
>         cgraph_edge::rebuild_edges ();
>
> -      /* Prevent IPA from removing child_fn as unreachable, since there are no
> -	 refs from the parent function to child_fn in offload LTO mode.  */
> -      if (ENABLE_OFFLOADING)
> -	cgraph_node::get (child_fn)->mark_force_output ();
> -
>         /* Some EH regions might become dead, see PR34608.  If
>   	 pass_cleanup_cfg isn't the first pass to happen with the
>   	 new child, these dead EH edges might cause problems.
> diff --git a/gcc/varpool.c b/gcc/varpool.c
> index 5e4fcbf..5ed65e5 100644
> --- a/gcc/varpool.c
> +++ b/gcc/varpool.c
> @@ -158,7 +158,6 @@ varpool_node::get_create (tree decl)
>   	  g->have_offload = true;
>   	  if (!in_lto_p)
>   	    vec_safe_push (offload_vars, decl);
> -	  node->force_output = 1;
>   	}
>       }
>
>



^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2015-12-16 16:02                         ` Tom de Vries
  2016-01-05 14:56                           ` [PING][PATCH] Mark symbols in offload tables with force_output in read_offload_tables Tom de Vries
@ 2016-01-08  8:17                           ` Richard Biener
  2016-01-14  9:31                             ` Thomas Schwinge
  1 sibling, 1 reply; 44+ messages in thread
From: Richard Biener @ 2016-01-08  8:17 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Thomas Schwinge, Jakub Jelinek, gcc-patches

On Wed, 16 Dec 2015, Tom de Vries wrote:

> On 10/12/15 14:14, Tom de Vries wrote:
> > [ copy-pasting-with-quote from
> > https://gcc.gnu.org/ml/gcc-patches/2015-12/msg00420.html , for some
> > reason I didn't get this email ]
> > 
> > > On Thu, 3 Dec 2015, Tom de Vries wrote:
> > > > The flag is set here in expand_omp_target:
> > > > ...
> > > > 12682         /* Prevent IPA from removing child_fn as unreachable,
> > > >                  since there are no
> > > > 12683            refs from the parent function to child_fn in offload
> > > >                  LTO mode.  */
> > > > 12684         if (ENABLE_OFFLOADING)
> > > > 12685           cgraph_node::get (child_fn)->mark_force_output ();
> > > > ...
> > > > 
> > > 
> > > How are there no refs from the "parent"?  Are there not refs from
> > > some kind of descriptor that maps fallback CPU and offloaded variants?
> > 
> > That descriptor is the offload table, which is emitted in
> > omp_finish_file. The function iterates over vectors offload_vars and
> > offload_funcs.
> > 
> > [ I would guess there's a one-on-one correspondance between
> > symtab_node::offloadable and membership of either offload_vars or
> > offload_funcs. ]
> > 
> > > I think the above needs sorting out in somw way, making the refs
> > > explicit rather than implicit via force_output.
> > 
> > I've tried an approach where I add a test for node->offloadable next to
> > each test for node->force_output, except for the test in the nonlocal_p
> > def in ipa_pta_execute. But I didn't (yet) manage to make that work.
> > 
> > > > I guess setting forced_by_abi instead would also mean child_fn is not
> > > > removed
> > > > as unreachable, while still allowing optimizations:
> > > > ...
> > > >   /* Like FORCE_OUTPUT, but in the case it is ABI requiring the symbol
> > > >      to be exported.  Unlike FORCE_OUTPUT this flag gets cleared to
> > > >      symbols promoted to static and it does not inhibit
> > > >      optimization.  */
> > > >   unsigned forced_by_abi : 1;
> > > > ...
> > > > 
> > > > But I suspect that other optimizations (than ipa-pta) might break
> > > > things.
> > > 
> > > How so?
> > 
> > Probably it's more accurate to say that I do not understand the
> > difference very well between force_output and force_by_abi, and what is
> > the class of optimizations enabled by using forced_by_abi instead of
> > force_output.'
> > 
> > > > Essentially we have two situations:
> > > > - in the host compiler, there is no need for the forced_output flag,
> > > >   and it inhibits optimization
> > > > - in the accelerator compiler, it (or some equivalent) is needed
> > 
> > Actually, things are slightly more complicated, I realize now. There's
> > also the distinction between:
> > - symbols declared as offloadable in the source code, and
> > - symbols create by the compiler and marked offloadable
> > 
> > > > I wonder if setting the force_output flag only when streaming the
> > > > bytecode for
> > > > offloading would work. That way, it wouldn't be set in the host
> > > > compiler,
> > > > while being set in the accelerator compiler.
> > > 
> > > Yeah, that was my original thinking btw.
> > 
> > FTR, I've tried that approach, as attached. It fixed the
> > goacc/kernels-alias-ipa-pta*.c failures. And I ran target-libgomp (also
> > using an accelerator configuration) without any regressions.
> 
> How about this patch?
> 
> We remove the setting of force_output when:
> - encountering offloadable symbols in the frontend, or
> - creating offloadable symbols in expand-omp.
> 
> Instead, we set force_output in input_offload_tables.
> 
> This is an improvement because:
> - it moves the force_output setting to a single location
> - it does the force_output setting ALAP

Works for me.

Richard.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2016-01-08  8:17                           ` [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta Richard Biener
@ 2016-01-14  9:31                             ` Thomas Schwinge
  2016-01-14  9:32                               ` Richard Biener
  0 siblings, 1 reply; 44+ messages in thread
From: Thomas Schwinge @ 2016-01-14  9:31 UTC (permalink / raw)
  To: Jakub Jelinek, Richard Biener, Tom de Vries; +Cc: gcc-patches

Hi!

On Fri, 8 Jan 2016 09:16:55 +0100, Richard Biener <rguenther@suse.de> wrote:
> On Wed, 16 Dec 2015, Tom de Vries wrote:
> > On 10/12/15 14:14, Tom de Vries wrote:
> > > > On Thu, 3 Dec 2015, Tom de Vries wrote:
> > > > > Essentially we have two situations:
> > > > > - in the host compiler, there is no need for the forced_output flag,
> > > > >   and it inhibits optimization
> > > > > - in the accelerator compiler, it (or some equivalent) is needed
> > > 
> > > Actually, things are slightly more complicated, I realize now. There's
> > > also the distinction between:
> > > - symbols declared as offloadable in the source code, and
> > > - symbols create by the compiler and marked offloadable
> > > 
> > > > > I wonder if setting the force_output flag only when streaming the
> > > > > bytecode for
> > > > > offloading would work. That way, it wouldn't be set in the host
> > > > > compiler,
> > > > > while being set in the accelerator compiler.
> > > > 
> > > > Yeah, that was my original thinking btw.
> > > 
> > > FTR, I've tried that approach, as attached. It fixed the
> > > goacc/kernels-alias-ipa-pta*.c failures.

Confirmed.  And, no change in offloading testing when applying the patch
to gomp-4_0-branch (where these FAILs didn't appear to begin with).

> > > And I ran target-libgomp (also
> > > using an accelerator configuration) without any regressions.

Confirmed.

> > How about this patch?
> > 
> > We remove the setting of force_output when:
> > - encountering offloadable symbols in the frontend, or
> > - creating offloadable symbols in expand-omp.
> > 
> > Instead, we set force_output in input_offload_tables.
> > 
> > This is an improvement because:
> > - it moves the force_output setting to a single location
> > - it does the force_output setting ALAP
> 
> Works for me.

Do we need review from Jakub or is this good for trunk?


Grüße
 Thomas

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta
  2016-01-14  9:31                             ` Thomas Schwinge
@ 2016-01-14  9:32                               ` Richard Biener
  0 siblings, 0 replies; 44+ messages in thread
From: Richard Biener @ 2016-01-14  9:32 UTC (permalink / raw)
  To: Thomas Schwinge; +Cc: Jakub Jelinek, Tom de Vries, gcc-patches

On Thu, 14 Jan 2016, Thomas Schwinge wrote:

> Hi!
> 
> On Fri, 8 Jan 2016 09:16:55 +0100, Richard Biener <rguenther@suse.de> wrote:
> > On Wed, 16 Dec 2015, Tom de Vries wrote:
> > > On 10/12/15 14:14, Tom de Vries wrote:
> > > > > On Thu, 3 Dec 2015, Tom de Vries wrote:
> > > > > > Essentially we have two situations:
> > > > > > - in the host compiler, there is no need for the forced_output flag,
> > > > > >   and it inhibits optimization
> > > > > > - in the accelerator compiler, it (or some equivalent) is needed
> > > > 
> > > > Actually, things are slightly more complicated, I realize now. There's
> > > > also the distinction between:
> > > > - symbols declared as offloadable in the source code, and
> > > > - symbols create by the compiler and marked offloadable
> > > > 
> > > > > > I wonder if setting the force_output flag only when streaming the
> > > > > > bytecode for
> > > > > > offloading would work. That way, it wouldn't be set in the host
> > > > > > compiler,
> > > > > > while being set in the accelerator compiler.
> > > > > 
> > > > > Yeah, that was my original thinking btw.
> > > > 
> > > > FTR, I've tried that approach, as attached. It fixed the
> > > > goacc/kernels-alias-ipa-pta*.c failures.
> 
> Confirmed.  And, no change in offloading testing when applying the patch
> to gomp-4_0-branch (where these FAILs didn't appear to begin with).
> 
> > > > And I ran target-libgomp (also
> > > > using an accelerator configuration) without any regressions.
> 
> Confirmed.
> 
> > > How about this patch?
> > > 
> > > We remove the setting of force_output when:
> > > - encountering offloadable symbols in the frontend, or
> > > - creating offloadable symbols in expand-omp.
> > > 
> > > Instead, we set force_output in input_offload_tables.
> > > 
> > > This is an improvement because:
> > > - it moves the force_output setting to a single location
> > > - it does the force_output setting ALAP
> > 
> > Works for me.
> 
> Do we need review from Jakub or is this good for trunk?

It's good for trunk.

Richard.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PING][PATCH] Mark symbols in offload tables with force_output in read_offload_tables
  2016-01-05 14:56                           ` [PING][PATCH] Mark symbols in offload tables with force_output in read_offload_tables Tom de Vries
@ 2016-01-25 13:27                             ` Ilya Verbin
  2016-01-26 12:22                               ` Tom de Vries
  0 siblings, 1 reply; 44+ messages in thread
From: Ilya Verbin @ 2016-01-25 13:27 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Richard Biener, Thomas Schwinge, Jakub Jelinek, gcc-patches

Hi!

On Tue, Jan 05, 2016 at 15:56:15 +0100, Tom de Vries wrote:
> >diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
> >index 62e5454..cdaee41 100644
> >--- a/gcc/lto-cgraph.c
> >+++ b/gcc/lto-cgraph.c
> >@@ -1911,6 +1911,11 @@ input_offload_tables (void)
> >  	      tree fn_decl
> >  		= lto_file_decl_data_get_fn_decl (file_data, decl_index);
> >  	      vec_safe_push (offload_funcs, fn_decl);
> >+
> >+	      /* Prevent IPA from removing fn_decl as unreachable, since there
> >+		 may be no refs from the parent function to child_fn in offload
> >+		 LTO mode.  */
> >+	      cgraph_node::get (fn_decl)->mark_force_output ();
> >  	    }
> >  	  else if (tag == LTO_symtab_variable)
> >  	    {
> >@@ -1918,6 +1923,10 @@ input_offload_tables (void)
> >  	      tree var_decl
> >  		= lto_file_decl_data_get_var_decl (file_data, decl_index);
> >  	      vec_safe_push (offload_vars, var_decl);
> >+
> >+	      /* Prevent IPA from removing var_decl as unused, since there
> >+		 may be no refs to var_decl in offload LTO mode.  */
> >+	      varpool_node::get (var_decl)->force_output = 1;
> >  	    }

This doesn't work when there is more than one LTO partition, because only first
partition contains full offload table to maintain correct order, but cgraph and
varpool nodes aren't necessarily created for the first partition.  To reproduce:

$ make check-target-libgomp RUNTESTFLAGS="c.exp=for-* --target_board=unix/-flto"
FAIL: libgomp.c/for-3.c (internal compiler error)
FAIL: libgomp.c/for-5.c (internal compiler error)
FAIL: libgomp.c/for-6.c (internal compiler error)
$ make check-target-libgomp RUNTESTFLAGS="c++.exp=for-* --target_board=unix/-flto"
FAIL: libgomp.c++/for-11.C (internal compiler error)
FAIL: libgomp.c++/for-13.C (internal compiler error)
FAIL: libgomp.c++/for-14.C (internal compiler error)

  -- Ilya

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PING][PATCH] Mark symbols in offload tables with force_output in read_offload_tables
  2016-01-25 13:27                             ` Ilya Verbin
@ 2016-01-26 12:22                               ` Tom de Vries
  2016-01-26 13:02                                 ` Ilya Verbin
  2016-01-26 14:13                                 ` [PING][PATCH] Mark symbols in offload tables with force_output in read_offload_tables Richard Biener
  0 siblings, 2 replies; 44+ messages in thread
From: Tom de Vries @ 2016-01-26 12:22 UTC (permalink / raw)
  To: Ilya Verbin; +Cc: Richard Biener, Thomas Schwinge, Jakub Jelinek, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1964 bytes --]

On 25/01/16 14:27, Ilya Verbin wrote:
> Hi!
>
> On Tue, Jan 05, 2016 at 15:56:15 +0100, Tom de Vries wrote:
>>> diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
>>> index 62e5454..cdaee41 100644
>>> --- a/gcc/lto-cgraph.c
>>> +++ b/gcc/lto-cgraph.c
>>> @@ -1911,6 +1911,11 @@ input_offload_tables (void)
>>>   	      tree fn_decl
>>>   		= lto_file_decl_data_get_fn_decl (file_data, decl_index);
>>>   	      vec_safe_push (offload_funcs, fn_decl);
>>> +
>>> +	      /* Prevent IPA from removing fn_decl as unreachable, since there
>>> +		 may be no refs from the parent function to child_fn in offload
>>> +		 LTO mode.  */
>>> +	      cgraph_node::get (fn_decl)->mark_force_output ();
>>>   	    }
>>>   	  else if (tag == LTO_symtab_variable)
>>>   	    {
>>> @@ -1918,6 +1923,10 @@ input_offload_tables (void)
>>>   	      tree var_decl
>>>   		= lto_file_decl_data_get_var_decl (file_data, decl_index);
>>>   	      vec_safe_push (offload_vars, var_decl);
>>> +
>>> +	      /* Prevent IPA from removing var_decl as unused, since there
>>> +		 may be no refs to var_decl in offload LTO mode.  */
>>> +	      varpool_node::get (var_decl)->force_output = 1;
>>>   	    }
>
> This doesn't work when there is more than one LTO partition, because only first
> partition contains full offload table to maintain correct order, but cgraph and
> varpool nodes aren't necessarily created for the first partition.  To reproduce:
>
> $ make check-target-libgomp RUNTESTFLAGS="c.exp=for-* --target_board=unix/-flto"
> FAIL: libgomp.c/for-3.c (internal compiler error)
> FAIL: libgomp.c/for-5.c (internal compiler error)
> FAIL: libgomp.c/for-6.c (internal compiler error)
> $ make check-target-libgomp RUNTESTFLAGS="c++.exp=for-* --target_board=unix/-flto"
> FAIL: libgomp.c++/for-11.C (internal compiler error)
> FAIL: libgomp.c++/for-13.C (internal compiler error)
> FAIL: libgomp.c++/for-14.C (internal compiler error)

This works for me.

OK for trunk?

Thanks,
- Tom


[-- Attachment #2: 0001-Check-that-cgraph-varpool_node-exists-before-use-in-input_offload_tables.patch --]
[-- Type: text/x-patch, Size: 1243 bytes --]

Check that cgraph/varpool_node exists before use in input_offload_tables

2016-01-26  Tom de Vries  <tom@codesourcery.com>

	* lto-cgraph.c (input_offload_tables): Check that cgraph/varpool_node
	exists before use.

---
 gcc/lto-cgraph.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
index 0634779..f4bcbaa 100644
--- a/gcc/lto-cgraph.c
+++ b/gcc/lto-cgraph.c
@@ -1915,7 +1915,9 @@ input_offload_tables (void)
 	      /* Prevent IPA from removing fn_decl as unreachable, since there
 		 may be no refs from the parent function to child_fn in offload
 		 LTO mode.  */
-	      cgraph_node::get (fn_decl)->mark_force_output ();
+	      cgraph_node *node = cgraph_node::get (fn_decl);
+	      if (node)
+		node->mark_force_output ();
 	    }
 	  else if (tag == LTO_symtab_variable)
 	    {
@@ -1926,7 +1928,9 @@ input_offload_tables (void)
 
 	      /* Prevent IPA from removing var_decl as unused, since there
 		 may be no refs to var_decl in offload LTO mode.  */
-	      varpool_node::get (var_decl)->force_output = 1;
+	      varpool_node *node = varpool_node::get (var_decl);
+	      if (node)
+		node->force_output = 1;
 	    }
 	  else
 	    fatal_error (input_location,

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PING][PATCH] Mark symbols in offload tables with force_output in read_offload_tables
  2016-01-26 12:22                               ` Tom de Vries
@ 2016-01-26 13:02                                 ` Ilya Verbin
  2016-02-08 13:20                                   ` Tom de Vries
  2016-01-26 14:13                                 ` [PING][PATCH] Mark symbols in offload tables with force_output in read_offload_tables Richard Biener
  1 sibling, 1 reply; 44+ messages in thread
From: Ilya Verbin @ 2016-01-26 13:02 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Richard Biener, Thomas Schwinge, Jakub Jelinek, gcc-patches

On Tue, Jan 26, 2016 at 13:21:57 +0100, Tom de Vries wrote:
> On 25/01/16 14:27, Ilya Verbin wrote:
> >On Tue, Jan 05, 2016 at 15:56:15 +0100, Tom de Vries wrote:
> >>>diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
> >>>index 62e5454..cdaee41 100644
> >>>--- a/gcc/lto-cgraph.c
> >>>+++ b/gcc/lto-cgraph.c
> >>>@@ -1911,6 +1911,11 @@ input_offload_tables (void)
> >>>  	      tree fn_decl
> >>>  		= lto_file_decl_data_get_fn_decl (file_data, decl_index);
> >>>  	      vec_safe_push (offload_funcs, fn_decl);
> >>>+
> >>>+	      /* Prevent IPA from removing fn_decl as unreachable, since there
> >>>+		 may be no refs from the parent function to child_fn in offload
> >>>+		 LTO mode.  */
> >>>+	      cgraph_node::get (fn_decl)->mark_force_output ();
> >>>  	    }
> >>>  	  else if (tag == LTO_symtab_variable)
> >>>  	    {
> >>>@@ -1918,6 +1923,10 @@ input_offload_tables (void)
> >>>  	      tree var_decl
> >>>  		= lto_file_decl_data_get_var_decl (file_data, decl_index);
> >>>  	      vec_safe_push (offload_vars, var_decl);
> >>>+
> >>>+	      /* Prevent IPA from removing var_decl as unused, since there
> >>>+		 may be no refs to var_decl in offload LTO mode.  */
> >>>+	      varpool_node::get (var_decl)->force_output = 1;
> >>>  	    }
> >
> >This doesn't work when there is more than one LTO partition, because only first
> >partition contains full offload table to maintain correct order, but cgraph and
> >varpool nodes aren't necessarily created for the first partition.  To reproduce:
> >
> >$ make check-target-libgomp RUNTESTFLAGS="c.exp=for-* --target_board=unix/-flto"
> >FAIL: libgomp.c/for-3.c (internal compiler error)
> >FAIL: libgomp.c/for-5.c (internal compiler error)
> >FAIL: libgomp.c/for-6.c (internal compiler error)
> >$ make check-target-libgomp RUNTESTFLAGS="c++.exp=for-* --target_board=unix/-flto"
> >FAIL: libgomp.c++/for-11.C (internal compiler error)
> >FAIL: libgomp.c++/for-13.C (internal compiler error)
> >FAIL: libgomp.c++/for-14.C (internal compiler error)
> 
> This works for me.
> 
> OK for trunk?
> 
> Thanks,
> - Tom
> 

> Check that cgraph/varpool_node exists before use in input_offload_tables
> 
> 2016-01-26  Tom de Vries  <tom@codesourcery.com>
> 
> 	* lto-cgraph.c (input_offload_tables): Check that cgraph/varpool_node
> 	exists before use.

In this case they will be not marked as force_output in other partitions (except
the first one).

  -- Ilya

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PING][PATCH] Mark symbols in offload tables with force_output in read_offload_tables
  2016-01-26 12:22                               ` Tom de Vries
  2016-01-26 13:02                                 ` Ilya Verbin
@ 2016-01-26 14:13                                 ` Richard Biener
  1 sibling, 0 replies; 44+ messages in thread
From: Richard Biener @ 2016-01-26 14:13 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Ilya Verbin, Thomas Schwinge, Jakub Jelinek, gcc-patches

On Tue, 26 Jan 2016, Tom de Vries wrote:

> On 25/01/16 14:27, Ilya Verbin wrote:
> > Hi!
> > 
> > On Tue, Jan 05, 2016 at 15:56:15 +0100, Tom de Vries wrote:
> > > > diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
> > > > index 62e5454..cdaee41 100644
> > > > --- a/gcc/lto-cgraph.c
> > > > +++ b/gcc/lto-cgraph.c
> > > > @@ -1911,6 +1911,11 @@ input_offload_tables (void)
> > > >   	      tree fn_decl
> > > >   		= lto_file_decl_data_get_fn_decl (file_data,
> > > > decl_index);
> > > >   	      vec_safe_push (offload_funcs, fn_decl);
> > > > +
> > > > +	      /* Prevent IPA from removing fn_decl as unreachable, since there
> > > > +		 may be no refs from the parent function to child_fn in
> > > > offload
> > > > +		 LTO mode.  */
> > > > +	      cgraph_node::get (fn_decl)->mark_force_output ();
> > > >   	    }
> > > >   	  else if (tag == LTO_symtab_variable)
> > > >   	    {
> > > > @@ -1918,6 +1923,10 @@ input_offload_tables (void)
> > > >   	      tree var_decl
> > > >   		= lto_file_decl_data_get_var_decl (file_data,
> > > > decl_index);
> > > >   	      vec_safe_push (offload_vars, var_decl);
> > > > +
> > > > +	      /* Prevent IPA from removing var_decl as unused, since there
> > > > +		 may be no refs to var_decl in offload LTO mode.  */
> > > > +	      varpool_node::get (var_decl)->force_output = 1;
> > > >   	    }
> > 
> > This doesn't work when there is more than one LTO partition, because only
> > first
> > partition contains full offload table to maintain correct order, but cgraph
> > and
> > varpool nodes aren't necessarily created for the first partition.  To
> > reproduce:
> > 
> > $ make check-target-libgomp RUNTESTFLAGS="c.exp=for-*
> > --target_board=unix/-flto"
> > FAIL: libgomp.c/for-3.c (internal compiler error)
> > FAIL: libgomp.c/for-5.c (internal compiler error)
> > FAIL: libgomp.c/for-6.c (internal compiler error)
> > $ make check-target-libgomp RUNTESTFLAGS="c++.exp=for-*
> > --target_board=unix/-flto"
> > FAIL: libgomp.c++/for-11.C (internal compiler error)
> > FAIL: libgomp.c++/for-13.C (internal compiler error)
> > FAIL: libgomp.c++/for-14.C (internal compiler error)
> 
> This works for me.
> 
> OK for trunk?

Ok.

Thanks,
Richard.

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PING][PATCH] Mark symbols in offload tables with force_output in read_offload_tables
  2016-01-26 13:02                                 ` Ilya Verbin
@ 2016-02-08 13:20                                   ` Tom de Vries
  2016-02-08 15:02                                     ` Ilya Verbin
  2016-02-15  7:38                                     ` [PING][PATCH] Don't mark offload symbols with force_output in ltrans Tom de Vries
  0 siblings, 2 replies; 44+ messages in thread
From: Tom de Vries @ 2016-02-08 13:20 UTC (permalink / raw)
  To: Ilya Verbin, Richard Biener; +Cc: Thomas Schwinge, Jakub Jelinek, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 3084 bytes --]

On 26/01/16 14:01, Ilya Verbin wrote:
> On Tue, Jan 26, 2016 at 13:21:57 +0100, Tom de Vries wrote:
>> On 25/01/16 14:27, Ilya Verbin wrote:
>>> On Tue, Jan 05, 2016 at 15:56:15 +0100, Tom de Vries wrote:
>>>>> diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
>>>>> index 62e5454..cdaee41 100644
>>>>> --- a/gcc/lto-cgraph.c
>>>>> +++ b/gcc/lto-cgraph.c
>>>>> @@ -1911,6 +1911,11 @@ input_offload_tables (void)
>>>>>   	      tree fn_decl
>>>>>   		= lto_file_decl_data_get_fn_decl (file_data, decl_index);
>>>>>   	      vec_safe_push (offload_funcs, fn_decl);
>>>>> +
>>>>> +	      /* Prevent IPA from removing fn_decl as unreachable, since there
>>>>> +		 may be no refs from the parent function to child_fn in offload
>>>>> +		 LTO mode.  */
>>>>> +	      cgraph_node::get (fn_decl)->mark_force_output ();
>>>>>   	    }
>>>>>   	  else if (tag == LTO_symtab_variable)
>>>>>   	    {
>>>>> @@ -1918,6 +1923,10 @@ input_offload_tables (void)
>>>>>   	      tree var_decl
>>>>>   		= lto_file_decl_data_get_var_decl (file_data, decl_index);
>>>>>   	      vec_safe_push (offload_vars, var_decl);
>>>>> +
>>>>> +	      /* Prevent IPA from removing var_decl as unused, since there
>>>>> +		 may be no refs to var_decl in offload LTO mode.  */
>>>>> +	      varpool_node::get (var_decl)->force_output = 1;
>>>>>   	    }
>>>
>>> This doesn't work when there is more than one LTO partition, because only first
>>> partition contains full offload table to maintain correct order, but cgraph and
>>> varpool nodes aren't necessarily created for the first partition.  To reproduce:
>>>
>>> $ make check-target-libgomp RUNTESTFLAGS="c.exp=for-* --target_board=unix/-flto"
>>> FAIL: libgomp.c/for-3.c (internal compiler error)
>>> FAIL: libgomp.c/for-5.c (internal compiler error)
>>> FAIL: libgomp.c/for-6.c (internal compiler error)
>>> $ make check-target-libgomp RUNTESTFLAGS="c++.exp=for-* --target_board=unix/-flto"
>>> FAIL: libgomp.c++/for-11.C (internal compiler error)
>>> FAIL: libgomp.c++/for-13.C (internal compiler error)
>>> FAIL: libgomp.c++/for-14.C (internal compiler error)
>>
>> This works for me.
>>
>> OK for trunk?
>>
>> Thanks,
>> - Tom
>>
>
>> Check that cgraph/varpool_node exists before use in input_offload_tables
>>
>> 2016-01-26  Tom de Vries  <tom@codesourcery.com>
>>
>> 	* lto-cgraph.c (input_offload_tables): Check that cgraph/varpool_node
>> 	exists before use.
>
> In this case they will be not marked as force_output in other partitions (except
> the first one).

AFAIU, that's not the case.

If we're splitting up lto compilation over partitions, it means we're 
first calling lto1 in WPA mode. We'll read in all offload tables, and 
mark all symbols with force_output, and when writing out the partitions, 
we'll write the offload symbols out with force_output set.

This updated patch only does the force_output marking for offload 
symbols in WPA or LTO. It's not necessary in LTRANS mode.

Bootstrapped and reg-tested on x86_64.

Build for nvidia accelerator and reg-tested libgomp with various lto 
settings.

OK for trunk, stage4?

Thanks,
- Tom


[-- Attachment #2: 0006-Don-t-mark-offload-symbols-with-force_output-in-ltrans.patch --]
[-- Type: text/x-patch, Size: 2627 bytes --]

Don't mark offload symbols with force_output in ltrans

2016-02-08  Tom de Vries  <tom@codesourcery.com>

	PR lto/69655
	* lto-cgraph.c (input_offload_tables): Add and handle bool parameter
	do_force_output.
	* lto-streamer.h (input_offload_tables): Add and handle bool parameter.

	* lto.c (read_cgraph_and_symbols): Call input_offload_tables with
	argument.

---
 gcc/lto-cgraph.c   | 8 +++++---
 gcc/lto-streamer.h | 2 +-
 gcc/lto/lto.c      | 2 +-
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
index 0634779..95c446d 100644
--- a/gcc/lto-cgraph.c
+++ b/gcc/lto-cgraph.c
@@ -1885,7 +1885,7 @@ input_symtab (void)
    target code, and store them into OFFLOAD_FUNCS and OFFLOAD_VARS.  */
 
 void
-input_offload_tables (void)
+input_offload_tables (bool do_force_output)
 {
   struct lto_file_decl_data **file_data_vec = lto_get_file_decl_data ();
   struct lto_file_decl_data *file_data;
@@ -1915,7 +1915,8 @@ input_offload_tables (void)
 	      /* Prevent IPA from removing fn_decl as unreachable, since there
 		 may be no refs from the parent function to child_fn in offload
 		 LTO mode.  */
-	      cgraph_node::get (fn_decl)->mark_force_output ();
+	      if (do_force_output)
+		cgraph_node::get (fn_decl)->mark_force_output ();
 	    }
 	  else if (tag == LTO_symtab_variable)
 	    {
@@ -1926,7 +1927,8 @@ input_offload_tables (void)
 
 	      /* Prevent IPA from removing var_decl as unused, since there
 		 may be no refs to var_decl in offload LTO mode.  */
-	      varpool_node::get (var_decl)->force_output = 1;
+	      if (do_force_output)
+		varpool_node::get (var_decl)->force_output = 1;
 	    }
 	  else
 	    fatal_error (input_location,
diff --git a/gcc/lto-streamer.h b/gcc/lto-streamer.h
index 0cb200e..f391161 100644
--- a/gcc/lto-streamer.h
+++ b/gcc/lto-streamer.h
@@ -915,7 +915,7 @@ bool lto_symtab_encoder_encode_initializer_p (lto_symtab_encoder_t,
 void output_symtab (void);
 void input_symtab (void);
 void output_offload_tables (void);
-void input_offload_tables (void);
+void input_offload_tables (bool);
 bool referenced_from_other_partition_p (struct ipa_ref_list *,
 				        lto_symtab_encoder_t);
 bool reachable_from_other_partition_p (struct cgraph_node *,
diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c
index ecec1bc..2736c5c 100644
--- a/gcc/lto/lto.c
+++ b/gcc/lto/lto.c
@@ -2855,7 +2855,7 @@ read_cgraph_and_symbols (unsigned nfiles, const char **fnames)
   /* Read the symtab.  */
   input_symtab ();
 
-  input_offload_tables ();
+  input_offload_tables (!flag_ltrans);
 
   /* Store resolutions into the symbol table.  */
 

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PING][PATCH] Mark symbols in offload tables with force_output in read_offload_tables
  2016-02-08 13:20                                   ` Tom de Vries
@ 2016-02-08 15:02                                     ` Ilya Verbin
  2016-02-15  7:38                                     ` [PING][PATCH] Don't mark offload symbols with force_output in ltrans Tom de Vries
  1 sibling, 0 replies; 44+ messages in thread
From: Ilya Verbin @ 2016-02-08 15:02 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Richard Biener, Thomas Schwinge, Jakub Jelinek, gcc-patches

On Mon, Feb 08, 2016 at 14:20:11 +0100, Tom de Vries wrote:
> On 26/01/16 14:01, Ilya Verbin wrote:
> >On Tue, Jan 26, 2016 at 13:21:57 +0100, Tom de Vries wrote:
> >>On 25/01/16 14:27, Ilya Verbin wrote:
> >>>On Tue, Jan 05, 2016 at 15:56:15 +0100, Tom de Vries wrote:
> >>>>>diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
> >>>>>index 62e5454..cdaee41 100644
> >>>>>--- a/gcc/lto-cgraph.c
> >>>>>+++ b/gcc/lto-cgraph.c
> >>>>>@@ -1911,6 +1911,11 @@ input_offload_tables (void)
> >>>>>  	      tree fn_decl
> >>>>>  		= lto_file_decl_data_get_fn_decl (file_data, decl_index);
> >>>>>  	      vec_safe_push (offload_funcs, fn_decl);
> >>>>>+
> >>>>>+	      /* Prevent IPA from removing fn_decl as unreachable, since there
> >>>>>+		 may be no refs from the parent function to child_fn in offload
> >>>>>+		 LTO mode.  */
> >>>>>+	      cgraph_node::get (fn_decl)->mark_force_output ();
> >>>>>  	    }
> >>>>>  	  else if (tag == LTO_symtab_variable)
> >>>>>  	    {
> >>>>>@@ -1918,6 +1923,10 @@ input_offload_tables (void)
> >>>>>  	      tree var_decl
> >>>>>  		= lto_file_decl_data_get_var_decl (file_data, decl_index);
> >>>>>  	      vec_safe_push (offload_vars, var_decl);
> >>>>>+
> >>>>>+	      /* Prevent IPA from removing var_decl as unused, since there
> >>>>>+		 may be no refs to var_decl in offload LTO mode.  */
> >>>>>+	      varpool_node::get (var_decl)->force_output = 1;
> >>>>>  	    }
> >>>
> >>>This doesn't work when there is more than one LTO partition, because only first
> >>>partition contains full offload table to maintain correct order, but cgraph and
> >>>varpool nodes aren't necessarily created for the first partition.  To reproduce:
> >>>
> >>>$ make check-target-libgomp RUNTESTFLAGS="c.exp=for-* --target_board=unix/-flto"
> >>>FAIL: libgomp.c/for-3.c (internal compiler error)
> >>>FAIL: libgomp.c/for-5.c (internal compiler error)
> >>>FAIL: libgomp.c/for-6.c (internal compiler error)
> >>>$ make check-target-libgomp RUNTESTFLAGS="c++.exp=for-* --target_board=unix/-flto"
> >>>FAIL: libgomp.c++/for-11.C (internal compiler error)
> >>>FAIL: libgomp.c++/for-13.C (internal compiler error)
> >>>FAIL: libgomp.c++/for-14.C (internal compiler error)
> >>
> >>This works for me.
> >>
> >>OK for trunk?
> >>
> >>Thanks,
> >>- Tom
> >>
> >
> >>Check that cgraph/varpool_node exists before use in input_offload_tables
> >>
> >>2016-01-26  Tom de Vries  <tom@codesourcery.com>
> >>
> >>	* lto-cgraph.c (input_offload_tables): Check that cgraph/varpool_node
> >>	exists before use.
> >
> >In this case they will be not marked as force_output in other partitions (except
> >the first one).
> 
> AFAIU, that's not the case.
> 
> If we're splitting up lto compilation over partitions, it means we're first
> calling lto1 in WPA mode. We'll read in all offload tables, and mark all
> symbols with force_output, and when writing out the partitions, we'll write
> the offload symbols out with force_output set.
> 
> This updated patch only does the force_output marking for offload symbols in
> WPA or LTO. It's not necessary in LTRANS mode.

You're right, works for me.

  -- Ilya

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [PING][PATCH] Don't mark offload symbols with force_output in ltrans
  2016-02-08 13:20                                   ` Tom de Vries
  2016-02-08 15:02                                     ` Ilya Verbin
@ 2016-02-15  7:38                                     ` Tom de Vries
  2016-02-15  9:07                                       ` Richard Biener
  1 sibling, 1 reply; 44+ messages in thread
From: Tom de Vries @ 2016-02-15  7:38 UTC (permalink / raw)
  To: Richard Biener; +Cc: Ilya Verbin, Thomas Schwinge, Jakub Jelinek, gcc-patches

[ was: [PING][PATCH] Mark symbols in offload tables with force_output in 
read_offload_tables ]

On 08/02/16 14:20, Tom de Vries wrote:
> On 26/01/16 14:01, Ilya Verbin wrote:
>> On Tue, Jan 26, 2016 at 13:21:57 +0100, Tom de Vries wrote:
>>> On 25/01/16 14:27, Ilya Verbin wrote:
>>>> On Tue, Jan 05, 2016 at 15:56:15 +0100, Tom de Vries wrote:
>>>>>> diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
>>>>>> index 62e5454..cdaee41 100644
>>>>>> --- a/gcc/lto-cgraph.c
>>>>>> +++ b/gcc/lto-cgraph.c
>>>>>> @@ -1911,6 +1911,11 @@ input_offload_tables (void)
>>>>>>             tree fn_decl
>>>>>>           = lto_file_decl_data_get_fn_decl (file_data, decl_index);
>>>>>>             vec_safe_push (offload_funcs, fn_decl);
>>>>>> +
>>>>>> +          /* Prevent IPA from removing fn_decl as unreachable,
>>>>>> since there
>>>>>> +         may be no refs from the parent function to child_fn in
>>>>>> offload
>>>>>> +         LTO mode.  */
>>>>>> +          cgraph_node::get (fn_decl)->mark_force_output ();
>>>>>>           }
>>>>>>         else if (tag == LTO_symtab_variable)
>>>>>>           {
>>>>>> @@ -1918,6 +1923,10 @@ input_offload_tables (void)
>>>>>>             tree var_decl
>>>>>>           = lto_file_decl_data_get_var_decl (file_data, decl_index);
>>>>>>             vec_safe_push (offload_vars, var_decl);
>>>>>> +
>>>>>> +          /* Prevent IPA from removing var_decl as unused, since
>>>>>> there
>>>>>> +         may be no refs to var_decl in offload LTO mode.  */
>>>>>> +          varpool_node::get (var_decl)->force_output = 1;
>>>>>>           }
>>>>
>>>> This doesn't work when there is more than one LTO partition, because
>>>> only first
>>>> partition contains full offload table to maintain correct order, but
>>>> cgraph and
>>>> varpool nodes aren't necessarily created for the first partition.
>>>> To reproduce:
>>>>
>>>> $ make check-target-libgomp RUNTESTFLAGS="c.exp=for-*
>>>> --target_board=unix/-flto"
>>>> FAIL: libgomp.c/for-3.c (internal compiler error)
>>>> FAIL: libgomp.c/for-5.c (internal compiler error)
>>>> FAIL: libgomp.c/for-6.c (internal compiler error)
>>>> $ make check-target-libgomp RUNTESTFLAGS="c++.exp=for-*
>>>> --target_board=unix/-flto"
>>>> FAIL: libgomp.c++/for-11.C (internal compiler error)
>>>> FAIL: libgomp.c++/for-13.C (internal compiler error)
>>>> FAIL: libgomp.c++/for-14.C (internal compiler error)
>>>
>>> This works for me.
>>>
>>> OK for trunk?
>>>
>>> Thanks,
>>> - Tom
>>>
>>
>>> Check that cgraph/varpool_node exists before use in input_offload_tables
>>>
>>> 2016-01-26  Tom de Vries  <tom@codesourcery.com>
>>>
>>>     * lto-cgraph.c (input_offload_tables): Check that
>>> cgraph/varpool_node
>>>     exists before use.
>>
>> In this case they will be not marked as force_output in other
>> partitions (except
>> the first one).
>
> AFAIU, that's not the case.
>
> If we're splitting up lto compilation over partitions, it means we're
> first calling lto1 in WPA mode. We'll read in all offload tables, and
> mark all symbols with force_output, and when writing out the partitions,
> we'll write the offload symbols out with force_output set.
>
> This updated patch only does the force_output marking for offload
> symbols in WPA or LTO. It's not necessary in LTRANS mode.
>
> Bootstrapped and reg-tested on x86_64.
>
> Build for nvidia accelerator and reg-tested libgomp with various lto
> settings.
>
> OK for trunk, stage4?
>

Ping. Original submission here: 
https://gcc.gnu.org/ml/gcc-patches/2016-02/msg00549.html .

Thanks,
- Tom

> 0006-Don-t-mark-offload-symbols-with-force_output-in-ltrans.patch
>
>
> Don't mark offload symbols with force_output in ltrans
>
> 2016-02-08  Tom de Vries  <tom@codesourcery.com>
>
> 	PR lto/69655
> 	* lto-cgraph.c (input_offload_tables): Add and handle bool parameter
> 	do_force_output.
> 	* lto-streamer.h (input_offload_tables): Add and handle bool parameter.
>
> 	* lto.c (read_cgraph_and_symbols): Call input_offload_tables with
> 	argument.
>
> ---
>   gcc/lto-cgraph.c   | 8 +++++---
>   gcc/lto-streamer.h | 2 +-
>   gcc/lto/lto.c      | 2 +-
>   3 files changed, 7 insertions(+), 5 deletions(-)
>
> diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
> index 0634779..95c446d 100644
> --- a/gcc/lto-cgraph.c
> +++ b/gcc/lto-cgraph.c
> @@ -1885,7 +1885,7 @@ input_symtab (void)
>      target code, and store them into OFFLOAD_FUNCS and OFFLOAD_VARS.  */
>
>   void
> -input_offload_tables (void)
> +input_offload_tables (bool do_force_output)
>   {
>     struct lto_file_decl_data **file_data_vec = lto_get_file_decl_data ();
>     struct lto_file_decl_data *file_data;
> @@ -1915,7 +1915,8 @@ input_offload_tables (void)
>   	      /* Prevent IPA from removing fn_decl as unreachable, since there
>   		 may be no refs from the parent function to child_fn in offload
>   		 LTO mode.  */
> -	      cgraph_node::get (fn_decl)->mark_force_output ();
> +	      if (do_force_output)
> +		cgraph_node::get (fn_decl)->mark_force_output ();
>   	    }
>   	  else if (tag == LTO_symtab_variable)
>   	    {
> @@ -1926,7 +1927,8 @@ input_offload_tables (void)
>
>   	      /* Prevent IPA from removing var_decl as unused, since there
>   		 may be no refs to var_decl in offload LTO mode.  */
> -	      varpool_node::get (var_decl)->force_output = 1;
> +	      if (do_force_output)
> +		varpool_node::get (var_decl)->force_output = 1;
>   	    }
>   	  else
>   	    fatal_error (input_location,
> diff --git a/gcc/lto-streamer.h b/gcc/lto-streamer.h
> index 0cb200e..f391161 100644
> --- a/gcc/lto-streamer.h
> +++ b/gcc/lto-streamer.h
> @@ -915,7 +915,7 @@ bool lto_symtab_encoder_encode_initializer_p (lto_symtab_encoder_t,
>   void output_symtab (void);
>   void input_symtab (void);
>   void output_offload_tables (void);
> -void input_offload_tables (void);
> +void input_offload_tables (bool);
>   bool referenced_from_other_partition_p (struct ipa_ref_list *,
>   				        lto_symtab_encoder_t);
>   bool reachable_from_other_partition_p (struct cgraph_node *,
> diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c
> index ecec1bc..2736c5c 100644
> --- a/gcc/lto/lto.c
> +++ b/gcc/lto/lto.c
> @@ -2855,7 +2855,7 @@ read_cgraph_and_symbols (unsigned nfiles, const char **fnames)
>     /* Read the symtab.  */
>     input_symtab ();
>
> -  input_offload_tables ();
> +  input_offload_tables (!flag_ltrans);
>
>     /* Store resolutions into the symbol table.  */
>
>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [PING][PATCH] Don't mark offload symbols with force_output in ltrans
  2016-02-15  7:38                                     ` [PING][PATCH] Don't mark offload symbols with force_output in ltrans Tom de Vries
@ 2016-02-15  9:07                                       ` Richard Biener
  0 siblings, 0 replies; 44+ messages in thread
From: Richard Biener @ 2016-02-15  9:07 UTC (permalink / raw)
  To: Tom de Vries; +Cc: Ilya Verbin, Thomas Schwinge, Jakub Jelinek, gcc-patches

On Mon, 15 Feb 2016, Tom de Vries wrote:

> [ was: [PING][PATCH] Mark symbols in offload tables with force_output in
> read_offload_tables ]
> 
> On 08/02/16 14:20, Tom de Vries wrote:
> > On 26/01/16 14:01, Ilya Verbin wrote:
> > > On Tue, Jan 26, 2016 at 13:21:57 +0100, Tom de Vries wrote:
> > > > On 25/01/16 14:27, Ilya Verbin wrote:
> > > > > On Tue, Jan 05, 2016 at 15:56:15 +0100, Tom de Vries wrote:
> > > > > > > diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
> > > > > > > index 62e5454..cdaee41 100644
> > > > > > > --- a/gcc/lto-cgraph.c
> > > > > > > +++ b/gcc/lto-cgraph.c
> > > > > > > @@ -1911,6 +1911,11 @@ input_offload_tables (void)
> > > > > > >             tree fn_decl
> > > > > > >           = lto_file_decl_data_get_fn_decl (file_data,
> > > > > > > decl_index);
> > > > > > >             vec_safe_push (offload_funcs, fn_decl);
> > > > > > > +
> > > > > > > +          /* Prevent IPA from removing fn_decl as unreachable,
> > > > > > > since there
> > > > > > > +         may be no refs from the parent function to child_fn in
> > > > > > > offload
> > > > > > > +         LTO mode.  */
> > > > > > > +          cgraph_node::get (fn_decl)->mark_force_output ();
> > > > > > >           }
> > > > > > >         else if (tag == LTO_symtab_variable)
> > > > > > >           {
> > > > > > > @@ -1918,6 +1923,10 @@ input_offload_tables (void)
> > > > > > >             tree var_decl
> > > > > > >           = lto_file_decl_data_get_var_decl (file_data,
> > > > > > > decl_index);
> > > > > > >             vec_safe_push (offload_vars, var_decl);
> > > > > > > +
> > > > > > > +          /* Prevent IPA from removing var_decl as unused, since
> > > > > > > there
> > > > > > > +         may be no refs to var_decl in offload LTO mode.  */
> > > > > > > +          varpool_node::get (var_decl)->force_output = 1;
> > > > > > >           }
> > > > > 
> > > > > This doesn't work when there is more than one LTO partition, because
> > > > > only first
> > > > > partition contains full offload table to maintain correct order, but
> > > > > cgraph and
> > > > > varpool nodes aren't necessarily created for the first partition.
> > > > > To reproduce:
> > > > > 
> > > > > $ make check-target-libgomp RUNTESTFLAGS="c.exp=for-*
> > > > > --target_board=unix/-flto"
> > > > > FAIL: libgomp.c/for-3.c (internal compiler error)
> > > > > FAIL: libgomp.c/for-5.c (internal compiler error)
> > > > > FAIL: libgomp.c/for-6.c (internal compiler error)
> > > > > $ make check-target-libgomp RUNTESTFLAGS="c++.exp=for-*
> > > > > --target_board=unix/-flto"
> > > > > FAIL: libgomp.c++/for-11.C (internal compiler error)
> > > > > FAIL: libgomp.c++/for-13.C (internal compiler error)
> > > > > FAIL: libgomp.c++/for-14.C (internal compiler error)
> > > > 
> > > > This works for me.
> > > > 
> > > > OK for trunk?
> > > > 
> > > > Thanks,
> > > > - Tom
> > > > 
> > > 
> > > > Check that cgraph/varpool_node exists before use in input_offload_tables
> > > > 
> > > > 2016-01-26  Tom de Vries  <tom@codesourcery.com>
> > > > 
> > > >     * lto-cgraph.c (input_offload_tables): Check that
> > > > cgraph/varpool_node
> > > >     exists before use.
> > > 
> > > In this case they will be not marked as force_output in other
> > > partitions (except
> > > the first one).
> > 
> > AFAIU, that's not the case.
> > 
> > If we're splitting up lto compilation over partitions, it means we're
> > first calling lto1 in WPA mode. We'll read in all offload tables, and
> > mark all symbols with force_output, and when writing out the partitions,
> > we'll write the offload symbols out with force_output set.
> > 
> > This updated patch only does the force_output marking for offload
> > symbols in WPA or LTO. It's not necessary in LTRANS mode.
> > 
> > Bootstrapped and reg-tested on x86_64.
> > 
> > Build for nvidia accelerator and reg-tested libgomp with various lto
> > settings.
> > 
> > OK for trunk, stage4?
> > 
> 
> Ping. Original submission here:
> https://gcc.gnu.org/ml/gcc-patches/2016-02/msg00549.html .

Ok.

Richhard.

> Thanks,
> - Tom
> 
> > 0006-Don-t-mark-offload-symbols-with-force_output-in-ltrans.patch
> > 
> > 
> > Don't mark offload symbols with force_output in ltrans
> > 
> > 2016-02-08  Tom de Vries  <tom@codesourcery.com>
> > 
> > 	PR lto/69655
> > 	* lto-cgraph.c (input_offload_tables): Add and handle bool parameter
> > 	do_force_output.
> > 	* lto-streamer.h (input_offload_tables): Add and handle bool
> > parameter.
> > 
> > 	* lto.c (read_cgraph_and_symbols): Call input_offload_tables with
> > 	argument.
> > 
> > ---
> >   gcc/lto-cgraph.c   | 8 +++++---
> >   gcc/lto-streamer.h | 2 +-
> >   gcc/lto/lto.c      | 2 +-
> >   3 files changed, 7 insertions(+), 5 deletions(-)
> > 
> > diff --git a/gcc/lto-cgraph.c b/gcc/lto-cgraph.c
> > index 0634779..95c446d 100644
> > --- a/gcc/lto-cgraph.c
> > +++ b/gcc/lto-cgraph.c
> > @@ -1885,7 +1885,7 @@ input_symtab (void)
> >      target code, and store them into OFFLOAD_FUNCS and OFFLOAD_VARS.  */
> > 
> >   void
> > -input_offload_tables (void)
> > +input_offload_tables (bool do_force_output)
> >   {
> >     struct lto_file_decl_data **file_data_vec = lto_get_file_decl_data ();
> >     struct lto_file_decl_data *file_data;
> > @@ -1915,7 +1915,8 @@ input_offload_tables (void)
> >   	      /* Prevent IPA from removing fn_decl as unreachable, since there
> >   		 may be no refs from the parent function to child_fn in
> > offload
> >   		 LTO mode.  */
> > -	      cgraph_node::get (fn_decl)->mark_force_output ();
> > +	      if (do_force_output)
> > +		cgraph_node::get (fn_decl)->mark_force_output ();
> >   	    }
> >   	  else if (tag == LTO_symtab_variable)
> >   	    {
> > @@ -1926,7 +1927,8 @@ input_offload_tables (void)
> > 
> >   	      /* Prevent IPA from removing var_decl as unused, since there
> >   		 may be no refs to var_decl in offload LTO mode.  */
> > -	      varpool_node::get (var_decl)->force_output = 1;
> > +	      if (do_force_output)
> > +		varpool_node::get (var_decl)->force_output = 1;
> >   	    }
> >   	  else
> >   	    fatal_error (input_location,
> > diff --git a/gcc/lto-streamer.h b/gcc/lto-streamer.h
> > index 0cb200e..f391161 100644
> > --- a/gcc/lto-streamer.h
> > +++ b/gcc/lto-streamer.h
> > @@ -915,7 +915,7 @@ bool lto_symtab_encoder_encode_initializer_p
> > (lto_symtab_encoder_t,
> >   void output_symtab (void);
> >   void input_symtab (void);
> >   void output_offload_tables (void);
> > -void input_offload_tables (void);
> > +void input_offload_tables (bool);
> >   bool referenced_from_other_partition_p (struct ipa_ref_list *,
> >   				        lto_symtab_encoder_t);
> >   bool reachable_from_other_partition_p (struct cgraph_node *,
> > diff --git a/gcc/lto/lto.c b/gcc/lto/lto.c
> > index ecec1bc..2736c5c 100644
> > --- a/gcc/lto/lto.c
> > +++ b/gcc/lto/lto.c
> > @@ -2855,7 +2855,7 @@ read_cgraph_and_symbols (unsigned nfiles, const char
> > **fnames)
> >     /* Read the symtab.  */
> >     input_symtab ();
> > 
> > -  input_offload_tables ();
> > +  input_offload_tables (!flag_ltrans);
> > 
> >     /* Store resolutions into the symbol table.  */
> > 
> > 
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2016-02-15  9:07 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-11-30  9:00 [PATCH, PR46032] Handle BUILT_IN_GOMP_PARALLEL in ipa-pta Tom de Vries
2015-11-30  9:36 ` Richard Biener
2015-11-30 12:20   ` Tom de Vries
2015-11-30 13:32     ` Richard Biener
2015-11-30 13:35       ` Jakub Jelinek
2015-12-03 11:49         ` Tom de Vries
2015-11-30 16:44       ` Tom de Vries
2015-11-30 16:52         ` Jakub Jelinek
2015-11-30 18:00           ` Tom de Vries
2015-12-01 12:27             ` Christophe Lyon
2015-12-01 12:30               ` Jakub Jelinek
2015-12-01 14:49                 ` Tom de Vries
2015-12-01 14:27         ` [PATCH] Handle BUILT_IN_GOACC_PARALLEL " Tom de Vries
2015-12-01 14:38           ` Richard Biener
2015-12-01 14:44           ` Jakub Jelinek
2015-12-01 23:48             ` Tom de Vries
2015-12-02  9:31               ` Jakub Jelinek
2015-12-02 17:58           ` Thomas Schwinge
2015-12-02 23:32             ` Tom de Vries
2015-12-03  0:10               ` Tom de Vries
2015-12-03  0:27                 ` Tom de Vries
2015-12-03  8:59                   ` Richard Biener
2015-12-03 11:09                     ` Tom de Vries
2015-12-03 11:13                       ` Jakub Jelinek
2015-12-03 11:13                       ` Richard Biener
2015-12-10 13:15                       ` Tom de Vries
2015-12-16 16:02                         ` Tom de Vries
2016-01-05 14:56                           ` [PING][PATCH] Mark symbols in offload tables with force_output in read_offload_tables Tom de Vries
2016-01-25 13:27                             ` Ilya Verbin
2016-01-26 12:22                               ` Tom de Vries
2016-01-26 13:02                                 ` Ilya Verbin
2016-02-08 13:20                                   ` Tom de Vries
2016-02-08 15:02                                     ` Ilya Verbin
2016-02-15  7:38                                     ` [PING][PATCH] Don't mark offload symbols with force_output in ltrans Tom de Vries
2016-02-15  9:07                                       ` Richard Biener
2016-01-26 14:13                                 ` [PING][PATCH] Mark symbols in offload tables with force_output in read_offload_tables Richard Biener
2016-01-08  8:17                           ` [PATCH] Handle BUILT_IN_GOACC_PARALLEL in ipa-pta Richard Biener
2016-01-14  9:31                             ` Thomas Schwinge
2016-01-14  9:32                               ` Richard Biener
2015-12-09 10:01     ` [PATCH, PR68716] Fix GOMP/GOACC_parallel handling in find_func_clobbers Tom de Vries
2015-12-09 10:04       ` Jakub Jelinek
2015-12-09 10:09         ` Richard Biener
2015-12-09 10:11         ` Tom de Vries
2015-12-10  9:01       ` [committed, obvious] Remove invalid assert in find_func_aliases_for_builtin_call Tom de Vries

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).