public inbox for gcc-patches@gcc.gnu.org
* [PATCH][AARCH64] Emulating aligned mask loads on AArch64
@ 2015-09-18 10:41 Pawel Kupidura
  2015-09-18 11:00 ` James Greenhalgh
  0 siblings, 1 reply; 3+ messages in thread
From: Pawel Kupidura @ 2015-09-18 10:41 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 824 bytes --]

This patch uses max reductions to emulate aligned masked loads on AArch64.
It reduces the mask to a scalar that is nonzero if any mask element is true,
then uses that scalar to select between the real address and a scratchpad
address.

The idea is that if the vector load is aligned, it cannot cross a page
boundary and so cannot partially fault.  It is safe to load from the
address (and use only some of the result) if any mask element is true.
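
In GIMPLE terms the emulated load looks roughly like this (illustrative
names, simplified):

  _anytrue = REDUC_MAX_EXPR <vec_mask>;        /* nonzero iff any mask lane is set */
  _safeb = _anytrue ? dataref_ptr : &safevec;  /* scratchpad address when every lane is false */
  vect_dest = MEM[(vector int *) _safeb];      /* ordinary aligned vector load */

Only the lanes whose mask bit is set are consumed afterwards, so the
values loaded for the other lanes (or from the scratchpad) do not matter.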

The patch provided a 15% speed improvement for simple microbenchmarks.

Several spec2k6 benchmarks were affected by the patch: 400.perlbench,
403.gcc, 436.cactusADM, 454.calculix and 464.h264ref.  However, the
changes had no measurable effect on performance.

Regression-tested on x86_64-linux-gnu, aarch64-linux-gnu and 
arm-linux-gnueabi.

Thanks,
Pawel

[-- Attachment #2: patch --]
[-- Type: text/plain, Size: 8722 bytes --]

diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
index 73f2729..066d133 100644
--- a/gcc/optabs-query.h
+++ b/gcc/optabs-query.h
@@ -134,5 +134,6 @@ bool can_vec_mask_load_store_p (machine_mode, bool);
 bool can_compare_and_swap_p (machine_mode, bool);
 bool can_atomic_exchange_p (machine_mode, bool);
 bool lshift_cheap_p (bool);
+bool supports_umax_reduction ();
 
 #endif
diff --git a/gcc/optabs-query.c b/gcc/optabs-query.c
index 254089f..23a85a4 100644
--- a/gcc/optabs-query.c
+++ b/gcc/optabs-query.c
@@ -463,6 +463,21 @@ can_mult_highpart_p (machine_mode mode, bool uns_p)
   return 0;
 }
 
+/* Return true if target supports unsigned max reduction for any mode.  */
+
+bool
+supports_umax_reduction ()
+{
+  machine_mode mode;
+
+  for (mode = MIN_MODE_VECTOR_INT; mode <= MAX_MODE_VECTOR_INT;
+       mode = (machine_mode) (mode + 1))
+    if (optab_handler (reduc_umax_scal_optab, mode) != CODE_FOR_nothing)
+      return true;
+
+  return false;
+}
+
 /* Return true if target supports vector masked load/store for mode.  */
 
 bool
diff --git a/gcc/testsuite/gcc.dg/vect/vect-align-4.c b/gcc/testsuite/gcc.dg/vect/vect-align-4.c
new file mode 100644
index 0000000..98db8e3
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-align-4.c
@@ -0,0 +1,65 @@
+/* { dg-require-effective-target umax_reduction } */
+
+#define N 512
+#define K 32
+
+extern void abort (void) __attribute__((noreturn));
+
+int a[N] __attribute__ ((aligned (16)));
+int b[N] __attribute__ ((aligned (16)));
+int c[N] __attribute__ ((aligned (16)));
+
+__attribute__ ((noinline)) void
+init_arrays () {
+  int i;
+
+  for (i = 0; i < N / 4; ++i)
+    a[i] = K + 1;
+
+  for (i = N / 4; i < N / 2; ++i)
+    a[i] = (i % 2 == 0) ? K - 1 : K + 1;
+
+  for (i = N / 2; i < N; ++i)
+    a[i] = K - 1;
+
+  for (i = 0; i < N; ++i)
+    b[i] = i;
+}
+
+__attribute__ ((noinline)) void
+check_array () {
+  int i = 0;
+
+  for (i = 0; i < N / 4; ++i)
+    if (c[i] != a[i])
+      abort ();
+
+  for (i = N / 4; i < N / 2; ++i)
+    if (c[i] != ((i % 2 == 0) ? b[i] : a[i]))
+      abort ();
+
+  for (i = N / 2; i < N; ++i)
+    if (c[i] != b[i])
+      abort ();
+}
+
+__attribute__ ((noinline)) void
+main1 (int* bp) {
+  int i;
+
+  for (i = 0; i < N; ++i)
+    c[i] = a[i] < K ? bp[i] : a[i];
+
+  check_array ();
+}
+
+int main (void) {
+  init_arrays ();
+
+  main1 (b);
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+
diff --git a/gcc/testsuite/gcc.dg/vect/vect-align-5.c b/gcc/testsuite/gcc.dg/vect/vect-align-5.c
new file mode 100644
index 0000000..93bfaa1
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-align-5.c
@@ -0,0 +1,65 @@
+/* { dg-require-effective-target umax_reduction } */
+
+#define N 512
+#define K 32
+
+extern void abort (void) __attribute__((noreturn));
+
+int a[N] __attribute__ ((aligned (16)));
+int b[N];
+int c[N] __attribute__ ((aligned (16)));
+
+__attribute__ ((noinline)) void
+init_arrays () {
+  int i;
+
+  for (i = 0; i < N / 4; ++i)
+    a[i] = K + 1;
+
+  for (i = N / 4; i < N / 2; ++i)
+    a[i] = (i % 2 == 0) ? K - 1 : K + 1;
+
+  for (i = N / 2; i < N; ++i)
+    a[i] = K - 1;
+
+  for (i = 0; i < N; ++i)
+    b[i] = i;
+}
+
+__attribute__ ((noinline)) void
+check_array () {
+  int i = 0;
+
+  for (i = 0; i < N / 4; ++i)
+    if (c[i] != a[i])
+      abort ();
+
+  for (i = N / 4; i < N / 2; ++i)
+    if (c[i] != ((i % 2 == 0) ? b[i] : a[i]))
+      abort ();
+
+  for (i = N / 2; i < N; ++i)
+    if (c[i] != b[i])
+      abort ();
+}
+
+__attribute__ ((noinline)) void
+main1 (int* bp) {
+  int i;
+
+  for (i = 0; i < N; ++i)
+    c[i] = a[i] < K ? bp[i] : a[i];
+
+  check_array ();
+}
+
+int main (void) {
+  init_arrays ();
+
+  main1 (b);
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+
diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
index a465eb1..9b1c338 100644
--- a/gcc/testsuite/lib/target-supports.exp
+++ b/gcc/testsuite/lib/target-supports.exp
@@ -6449,3 +6449,14 @@ proc check_effective_target_comdat_group {} {
 	int (*fn) () = foo;
     }]
 }
+
+# Return 1 if the target supports unsigned max vector reduction.
+
+proc check_effective_target_umax_reduction { } {
+    if { [istarget aarch64*-*-*] } {
+	return 1;
+    } else {
+	return 0;
+    }
+}
+
diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
index 0987884..4f84705 100644
--- a/gcc/tree-if-conv.c
+++ b/gcc/tree-if-conv.c
@@ -811,7 +811,8 @@ ifcvt_can_use_mask_load_store (gimple stmt)
       || VECTOR_MODE_P (mode))
     return false;
 
-  if (can_vec_mask_load_store_p (mode, is_load))
+  if (can_vec_mask_load_store_p (mode, is_load)
+      || (is_load && supports_umax_reduction ()))
     return true;
 
   return false;
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 671e613..4f8c2c5 100644
--- a/gcc/tree-vect-data-refs.c
+++ b/gcc/tree-vect-data-refs.c
@@ -5749,10 +5749,19 @@ vect_supportable_dr_alignment (struct data_reference *dr,
   /* For now assume all conditional loads/stores support unaligned
      access without any special code.  */
   if (is_gimple_call (stmt)
-      && gimple_call_internal_p (stmt)
-      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
-	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
-    return dr_unaligned_supported;
+      && gimple_call_internal_p (stmt))
+    {
+      if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD)
+	return (can_vec_mask_load_store_p (mode, true)
+		? dr_unaligned_supported
+		: dr_unaligned_unsupported);
+      else if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+	{
+	  gcc_checking_assert (can_vec_mask_load_store_p (
+				TYPE_MODE (TREE_TYPE (vectype)), false));
+	  return dr_unaligned_supported;
+	}
+    }
 
   if (loop_vinfo)
     {
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index d4a436d..2a8c231 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -1840,7 +1840,9 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
 				 : DR_STEP (dr), size_zero_node) <= 0)
     return false;
   else if (!VECTOR_MODE_P (TYPE_MODE (vectype))
-	   || !can_vec_mask_load_store_p (TYPE_MODE (vectype), !is_store))
+	   || !(can_vec_mask_load_store_p (TYPE_MODE (vectype), !is_store)
+		|| (optab_handler (reduc_umax_scal_optab,
+				   TYPE_MODE (vectype)) != CODE_FOR_nothing)))
     return false;
 
   if (TREE_CODE (mask) != SSA_NAME)
@@ -2140,12 +2142,43 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
 	    misalign = DR_MISALIGNMENT (dr);
 	  set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
 				  misalign);
-	  new_stmt
-	    = gimple_build_call_internal (IFN_MASK_LOAD, 3, dataref_ptr,
-					  gimple_call_arg (stmt, 1),
-					  vec_mask);
-	  gimple_call_set_lhs (new_stmt, make_ssa_name (vec_dest));
-	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
+
+	  if (can_vec_mask_load_store_p (TYPE_MODE (vectype), !is_store))
+	    {
+	      new_stmt
+		= gimple_build_call_internal (IFN_MASK_LOAD, 3, dataref_ptr,
+					      gimple_call_arg (stmt, 1),
+					      vec_mask);
+	      gimple_call_set_lhs (new_stmt, make_ssa_name (vec_dest));
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	    }
+	  else
+	    {
+	      tree anytrue = make_temp_ssa_name (TREE_TYPE (
+						  TREE_TYPE (vec_mask)),
+						 NULL, "_anytrue");
+	      tree reduction = build1 (REDUC_MAX_EXPR, TREE_TYPE (anytrue),
+				       vec_mask);
+	      gimple anytrue_init = gimple_build_assign (anytrue, reduction);
+	      vect_finish_stmt_generation (stmt, anytrue_init, gsi);
+
+	      tree temp_addr = build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
+				       create_tmp_var (vectype, "safevec"));
+	      tree vec_cond_expr = build3 (COND_EXPR, vectype, anytrue,
+					   dataref_ptr, temp_addr);
+
+	      tree safeb = make_temp_ssa_name (TREE_TYPE (dataref_ptr),
+					       NULL, "_safeb");
+	      gimple safeb_init = gimple_build_assign (safeb, vec_cond_expr);
+	      vect_finish_stmt_generation (stmt, safeb_init, gsi);
+
+	      tree load = build2 (MEM_REF, vectype, safeb,
+				  build_int_cst (ptr_type_node, 0));
+	      new_stmt
+		= gimple_build_assign (make_ssa_name (vec_dest), load);
+	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
+	    }
+
 	  if (i == 0)
 	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
 	  else


* Re: [PATCH][AARCH64] Emulating aligned mask loads on AArch64
  2015-09-18 10:41 [PATCH][AARCH64] Emulating aligned mask loads on AArch64 Pawel Kupidura
@ 2015-09-18 11:00 ` James Greenhalgh
  2015-09-18 12:04   ` Richard Biener
  0 siblings, 1 reply; 3+ messages in thread
From: James Greenhalgh @ 2015-09-18 11:00 UTC (permalink / raw)
  To: Pawel Kupidura; +Cc: gcc-patches, rguenther, ook

On Fri, Sep 18, 2015 at 11:24:50AM +0100, Pawel Kupidura wrote:
> This patch uses max reductions to emulate aligned masked loads on AArch64.
> It reduces the mask to a scalar that is nonzero if any mask element is true,
> then uses that scalar to select between the real address and a scratchpad
> address.
> 
> The idea is that if the vector load is aligned, it cannot cross a page
> boundary and so cannot partially fault.  It is safe to load from the
> address (and use only some of the result) if any mask element is true.
> 
> The patch provided a 15% speed improvement for simple microbenchmarks.
> 
> Several spec2k6 benchmarks were affected by the patch: 400.perlbench,
> 403.gcc, 436.cactusADM, 454.calculix and 464.h264ref.  However, the
> changes had no measurable effect on performance.
> 
> Regression-tested on x86_64-linux-gnu, aarch64-linux-gnu and 
> arm-linux-gnueabi.

Hi Pawel, this patch doesn't look AArch64 specific to me. You will probably
get more traction with reviews if you post it tagged appropriately and
with the relevant maintainers on CC, in this case - as an auto-vectorizer
patch, Richard Biener and Zdenek Dvorak.

It is also customary to include a ChangeLog entry with your submissions;
this makes it easy to see at a glance what your patch modifies.
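
For this patch that might look something like (just a sketch based on
the files touched):

gcc/ChangeLog:

	* optabs-query.h (supports_umax_reduction): Declare.
	* optabs-query.c (supports_umax_reduction): New function.
	* tree-if-conv.c (ifcvt_can_use_mask_load_store): Also allow masked
	loads when an unsigned max reduction is available.
	* tree-vect-data-refs.c (vect_supportable_dr_alignment): Only report
	unaligned support for masked loads/stores the target really handles.
	* tree-vect-stmts.c (vectorizable_mask_load_store): Emulate masked
	loads with a max reduction and a scratchpad when the target has no
	masked load support.

gcc/testsuite/ChangeLog:

	* lib/target-supports.exp (check_effective_target_umax_reduction):
	New procedure.
	* gcc.dg/vect/vect-align-4.c: New test.
	* gcc.dg/vect/vect-align-5.c: New test.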

Thanks,
James


* Re: [PATCH][AARCH64] Emulating aligned mask loads on AArch64
  2015-09-18 11:00 ` James Greenhalgh
@ 2015-09-18 12:04   ` Richard Biener
  0 siblings, 0 replies; 3+ messages in thread
From: Richard Biener @ 2015-09-18 12:04 UTC (permalink / raw)
  To: James Greenhalgh; +Cc: Pawel Kupidura, gcc-patches, ook

On Fri, 18 Sep 2015, James Greenhalgh wrote:

> On Fri, Sep 18, 2015 at 11:24:50AM +0100, Pawel Kupidura wrote:
> > This patch uses max reductions to emulate aligned masked loads on AArch64.
> > It reduces the mask to a scalar that is nonzero if any mask element is true,
> > then uses that scalar to select between the real address and a scratchpad
> > address.
> > 
> > The idea is that if the vector load is aligned, it cannot cross a page
> > boundary and so cannot partially fault.  It is safe to load from the
> > address (and use only some of the result) if any mask element is true.
> > 
> > The patch provided a 15% speed improvement for simple microbenchmarks.
> > 
> > Several spec2k6 benchmarks were affected by the patch: 400.perlbench,
> > 403.gcc, 436.cactusADM, 454.calculix and 464.h264ref.  However, the
> > changes had no measurable effect on performance.
> > 
> > Regression-tested on x86_64-linux-gnu, aarch64-linux-gnu and 
> > arm-linux-gnueabi.
> 
> Hi Pawel, this patch doesn't look AArch64 specific to me. You will probably
> get more traction with reviews if you post it tagged appropriately and
> with the relevant maintainers on CC, in this case - as an auto-vectorizer
> patch, Richard Biener and Zdenek Dvorak.
> 
> It is also customary to include a ChangeLog entry with your submissions;
> this makes it easy to see at a glance what your patch modifies.

Some comments - first of all, you don't need REDUC_MAX_EXPR; you only
need a test for a non-zero mask.  You can do this by querying an integer
mode of vec_mask's size and then using

  VIEW_CONVERT_EXPR<type-with-that-mode> (vec_mask) != 0 ? ...
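
For a 128-bit mask that would be roughly (a sketch only; the actual
integer type depends on the mode you get back):

  _maskbits = VIEW_CONVERT_EXPR<unsigned __int128> (vec_mask);
  _safeb = _maskbits != 0 ? dataref_ptr : &safevec;
  vect_dest = MEM[(vector int *) _safeb];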

Now to your assumption that an "aligned" vector load cannot trap.  I
think that assumption is wrong if the alignment requirement of the
target is smaller than the size of the vector.  Note that the current
implementation inside the vectorizer treats "aligned" as aligned to the
vector size, so I think you are fine here.

As you are converting the masked load to a conditional load, the target
may have native support for that, in which case you could avoid the
scratch memory (you can also avoid creating more than one scratch memory
per vectorized function - all vectors are of the same size).  I'm not
sure there is any target this applies to, though.  IIRC AMD XOP had a
real vector "cond-expr" that allowed a memory operand in one of the arms
(but I don't remember whether the specification said anything about not
performing the load if the mask specifies that all bits come from the
other operand).

Other than that, a minor comment on the patch below...

> Thanks,
> James
> 
> > diff --git a/gcc/optabs-query.h b/gcc/optabs-query.h
> > index 73f2729..066d133 100644
> > --- a/gcc/optabs-query.h
> > +++ b/gcc/optabs-query.h
> > @@ -134,5 +134,6 @@ bool can_vec_mask_load_store_p (machine_mode, bool);
> >  bool can_compare_and_swap_p (machine_mode, bool);
> >  bool can_atomic_exchange_p (machine_mode, bool);
> >  bool lshift_cheap_p (bool);
> > +bool supports_umax_reduction ();
> >  
> >  #endif
> > diff --git a/gcc/optabs-query.c b/gcc/optabs-query.c
> > index 254089f..23a85a4 100644
> > --- a/gcc/optabs-query.c
> > +++ b/gcc/optabs-query.c
> > @@ -463,6 +463,21 @@ can_mult_highpart_p (machine_mode mode, bool uns_p)
> >    return 0;
> >  }
> >  
> > +/* Return true if target supports unsigned max reduction for any mode.  */
> > +
> > +bool
> > +supports_umax_reduction ()
> > +{
> > +  machine_mode mode;
> > +
> > +  for (mode = MIN_MODE_VECTOR_INT; mode <= MAX_MODE_VECTOR_INT;
> > +       mode = (machine_mode) (mode + 1))
> > +    if (optab_handler (reduc_umax_scal_optab, mode) != CODE_FOR_nothing)
> > +      return true;
> > +
> > +  return false;
> > +}
> > +
> >  /* Return true if target supports vector masked load/store for mode.  */
> >  
> >  bool
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-align-4.c b/gcc/testsuite/gcc.dg/vect/vect-align-4.c
> > new file mode 100644
> > index 0000000..98db8e3
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-align-4.c
> > @@ -0,0 +1,65 @@
> > +/* { dg-require-effective-target umax_reduction } */
> > +
> > +#define N 512
> > +#define K 32
> > +
> > +extern void abort (void) __attribute__((noreturn));
> > +
> > +int a[N] __attribute__ ((aligned (16)));
> > +int b[N] __attribute__ ((aligned (16)));
> > +int c[N] __attribute__ ((aligned (16)));
> > +
> > +__attribute__ ((noinline)) void
> > +init_arrays () {
> > +  int i;
> > +
> > +  for (i = 0; i < N / 4; ++i)
> > +    a[i] = K + 1;
> > +
> > +  for (i = N / 4; i < N / 2; ++i)
> > +    a[i] = (i % 2 == 0) ? K - 1 : K + 1;
> > +
> > +  for (i = N / 2; i < N; ++i)
> > +    a[i] = K - 1;
> > +
> > +  for (i = 0; i < N; ++i)
> > +    b[i] = i;
> > +}
> > +
> > +__attribute__ ((noinline)) void
> > +check_array () {
> > +  int i = 0;
> > +
> > +  for (i = 0; i < N / 4; ++i)
> > +    if (c[i] != a[i])
> > +      abort ();
> > +
> > +  for (i = N / 4; i < N / 2; ++i)
> > +    if (c[i] != ((i % 2 == 0) ? b[i] : a[i]))
> > +      abort ();
> > +
> > +  for (i = N / 2; i < N; ++i)
> > +    if (c[i] != b[i])
> > +      abort ();
> > +}
> > +
> > +__attribute__ ((noinline)) void
> > +main1 (int* bp) {
> > +  int i;
> > +
> > +  for (i = 0; i < N; ++i)
> > +    c[i] = a[i] < K ? bp[i] : a[i];
> > +
> > +  check_array ();
> > +}
> > +
> > +int main (void) {
> > +  init_arrays ();
> > +
> > +  main1 (b);
> > +
> > +  return 0;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> > +
> > diff --git a/gcc/testsuite/gcc.dg/vect/vect-align-5.c b/gcc/testsuite/gcc.dg/vect/vect-align-5.c
> > new file mode 100644
> > index 0000000..93bfaa1
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/vect/vect-align-5.c
> > @@ -0,0 +1,65 @@
> > +/* { dg-require-effective-target umax_reduction } */
> > +
> > +#define N 512
> > +#define K 32
> > +
> > +extern void abort (void) __attribute__((noreturn));
> > +
> > +int a[N] __attribute__ ((aligned (16)));
> > +int b[N];
> > +int c[N] __attribute__ ((aligned (16)));
> > +
> > +__attribute__ ((noinline)) void
> > +init_arrays () {
> > +  int i;
> > +
> > +  for (i = 0; i < N / 4; ++i)
> > +    a[i] = K + 1;
> > +
> > +  for (i = N / 4; i < N / 2; ++i)
> > +    a[i] = (i % 2 == 0) ? K - 1 : K + 1;
> > +
> > +  for (i = N / 2; i < N; ++i)
> > +    a[i] = K - 1;
> > +
> > +  for (i = 0; i < N; ++i)
> > +    b[i] = i;
> > +}
> > +
> > +__attribute__ ((noinline)) void
> > +check_array () {
> > +  int i = 0;
> > +
> > +  for (i = 0; i < N / 4; ++i)
> > +    if (c[i] != a[i])
> > +      abort ();
> > +
> > +  for (i = N / 4; i < N / 2; ++i)
> > +    if (c[i] != ((i % 2 == 0) ? b[i] : a[i]))
> > +      abort ();
> > +
> > +  for (i = N / 2; i < N; ++i)
> > +    if (c[i] != b[i])
> > +      abort ();
> > +}
> > +
> > +__attribute__ ((noinline)) void
> > +main1 (int* bp) {
> > +  int i;
> > +
> > +  for (i = 0; i < N; ++i)
> > +    c[i] = a[i] < K ? bp[i] : a[i];
> > +
> > +  check_array ();
> > +}
> > +
> > +int main (void) {
> > +  init_arrays ();
> > +
> > +  main1 (b);
> > +
> > +  return 0;
> > +}
> > +
> > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
> > +
> > diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp
> > index a465eb1..9b1c338 100644
> > --- a/gcc/testsuite/lib/target-supports.exp
> > +++ b/gcc/testsuite/lib/target-supports.exp
> > @@ -6449,3 +6449,14 @@ proc check_effective_target_comdat_group {} {
> >  	int (*fn) () = foo;
> >      }]
> >  }
> > +
> > +# Return 1 if the target supports unsigned max vector reduction.
> > +
> > +proc check_effective_target_umax_reduction { } {
> > +    if { [istarget aarch64*-*-*] } {
> > +	return 1;
> > +    } else {
> > +	return 0;
> > +    }
> > +}
> > +
> > diff --git a/gcc/tree-if-conv.c b/gcc/tree-if-conv.c
> > index 0987884..4f84705 100644
> > --- a/gcc/tree-if-conv.c
> > +++ b/gcc/tree-if-conv.c
> > @@ -811,7 +811,8 @@ ifcvt_can_use_mask_load_store (gimple stmt)
> >        || VECTOR_MODE_P (mode))
> >      return false;
> >  
> > -  if (can_vec_mask_load_store_p (mode, is_load))
> > +  if (can_vec_mask_load_store_p (mode, is_load)
> > +      || (is_load && supports_umax_reduction ()))
> >      return true;
> >  
> >    return false;
> > diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
> > index 671e613..4f8c2c5 100644
> > --- a/gcc/tree-vect-data-refs.c
> > +++ b/gcc/tree-vect-data-refs.c
> > @@ -5749,10 +5749,19 @@ vect_supportable_dr_alignment (struct data_reference *dr,
> >    /* For now assume all conditional loads/stores support unaligned
> >       access without any special code.  */
> >    if (is_gimple_call (stmt)
> > -      && gimple_call_internal_p (stmt)
> > -      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
> > -	  || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
> > -    return dr_unaligned_supported;
> > +      && gimple_call_internal_p (stmt))
> > +    {
> > +      if (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD)
> > +	return (can_vec_mask_load_store_p (mode, true)
> > +		? dr_unaligned_supported
> > +		: dr_unaligned_unsupported);
> > +      else if (gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
> > +	{
> > +	  gcc_checking_assert (can_vec_mask_load_store_p (
> > +				TYPE_MODE (TREE_TYPE (vectype)), false));
> > +	  return dr_unaligned_supported;
> > +	}
> > +    }
> >  
> >    if (loop_vinfo)
> >      {
> > diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
> > index d4a436d..2a8c231 100644
> > --- a/gcc/tree-vect-stmts.c
> > +++ b/gcc/tree-vect-stmts.c
> > @@ -1840,7 +1840,9 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
> >  				 : DR_STEP (dr), size_zero_node) <= 0)
> >      return false;
> >    else if (!VECTOR_MODE_P (TYPE_MODE (vectype))
> > -	   || !can_vec_mask_load_store_p (TYPE_MODE (vectype), !is_store))
> > +	   || !(can_vec_mask_load_store_p (TYPE_MODE (vectype), !is_store)
> > +		|| (optab_handler (reduc_umax_scal_optab,
> > +				   TYPE_MODE (vectype)) != CODE_FOR_nothing)))
> >      return false;
> >  
> >    if (TREE_CODE (mask) != SSA_NAME)
> > @@ -2140,12 +2142,43 @@ vectorizable_mask_load_store (gimple stmt, gimple_stmt_iterator *gsi,
> >  	    misalign = DR_MISALIGNMENT (dr);
> >  	  set_ptr_info_alignment (get_ptr_info (dataref_ptr), align,
> >  				  misalign);
> > -	  new_stmt
> > -	    = gimple_build_call_internal (IFN_MASK_LOAD, 3, dataref_ptr,
> > -					  gimple_call_arg (stmt, 1),
> > -					  vec_mask);
> > -	  gimple_call_set_lhs (new_stmt, make_ssa_name (vec_dest));
> > -	  vect_finish_stmt_generation (stmt, new_stmt, gsi);
> > +
> > +	  if (can_vec_mask_load_store_p (TYPE_MODE (vectype), !is_store))
> > +	    {
> > +	      new_stmt
> > +		= gimple_build_call_internal (IFN_MASK_LOAD, 3, dataref_ptr,
> > +					      gimple_call_arg (stmt, 1),
> > +					      vec_mask);
> > +	      gimple_call_set_lhs (new_stmt, make_ssa_name (vec_dest));
> > +	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
> > +	    }
> > +	  else
> > +	    {
> > +	      tree anytrue = make_temp_ssa_name (TREE_TYPE (
> > +						  TREE_TYPE (vec_mask)),
> > +						 NULL, "_anytrue");
> > +	      tree reduction = build1 (REDUC_MAX_EXPR, TREE_TYPE (anytrue),
> > +				       vec_mask);
> > +	      gimple anytrue_init = gimple_build_assign (anytrue, reduction);
> > +	      vect_finish_stmt_generation (stmt, anytrue_init, gsi);
> > +
> > +	      tree temp_addr = build1 (ADDR_EXPR, TREE_TYPE (dataref_ptr),
> > +				       create_tmp_var (vectype, "safevec"));
> > +	      tree vec_cond_expr = build3 (COND_EXPR, vectype, anytrue,
> > +					   dataref_ptr, temp_addr);
> > +
> > +	      tree safeb = make_temp_ssa_name (TREE_TYPE (dataref_ptr),
> > +					       NULL, "_safeb");
> > +	      gimple safeb_init = gimple_build_assign (safeb, vec_cond_expr);

Please use gimple_build_assign with the operand form, avoiding
build1 (REDUC_MAX_EXPR, ...) and build3 (COND_EXPR, ...).
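
I.e. something like (untested):

	      gimple anytrue_init
		= gimple_build_assign (anytrue, REDUC_MAX_EXPR, vec_mask);
	      ...
	      gimple safeb_init
		= gimple_build_assign (safeb, COND_EXPR, anytrue,
				       dataref_ptr, temp_addr);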

> > +	      vect_finish_stmt_generation (stmt, safeb_init, gsi);
> > +
> > +	      tree load = build2 (MEM_REF, vectype, safeb,
> > +				  build_int_cst (ptr_type_node, 0));

so you are emitting loads with alias-set zero.  That's bad for
scheduling and later optimization.  See how to do better
by looking at how 'alias_off' is computed in vectorizable_load.
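
I.e. something along these lines (a sketch; 'dr' here is the stmt's data
reference, as used elsewhere in this function):

	      tree ref_type = reference_alias_ptr_type (DR_REF (dr));
	      tree load = build2 (MEM_REF, vectype, safeb,
				  build_int_cst (ref_type, 0));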

Richard.

> > +	      new_stmt
> > +		= gimple_build_assign (make_ssa_name (vec_dest), load);
> > +	      vect_finish_stmt_generation (stmt, new_stmt, gsi);
> > +	    }
> > +
> >  	  if (i == 0)
> >  	    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
> >  	  else
> 
> 

-- 
Richard Biener <rguenther@suse.de>
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nuernberg)
