public inbox for gcc-patches@gcc.gnu.org
From: James Greenhalgh <james.greenhalgh@arm.com>
To: <gcc-patches@gcc.gnu.org>
Cc: <nd@arm.com>, <ramana.radhakrishnan@arm.com>,
	<marcus.shawcroft@arm.com>,	<richard.earnshaw@arm.com>
Subject: [Patch AArch64] GCC 6 regression in vector performance. - Fix vector initialization to happen with lane load instructions.
Date: Wed, 20 Jan 2016 15:22:00 -0000
Message-ID: <1453303331-14492-1-git-send-email-james.greenhalgh@arm.com>

[-- Attachment #1: Type: text/plain, Size: 1313 bytes --]


Hi,

In a number of cases where we try to create vectors we end up spilling the
elements to the stack and then reloading the whole vector. The attached test
case is one example, distilled from a couple of micro-benchmarks where the
issue shows up. The extra cost in this case comes from the unnecessary round
trip through the stack. The patch finesses this by using lane loads or
vector inserts to produce the right results instead.
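
For illustration, here is a minimal sketch (using GCC's generic vector
extensions; the v4sf typedef and make_vec name are mine, not part of the
patch) of the kind of all-variable constructor affected:

typedef float v4sf __attribute__ ((vector_size (16)));

v4sf
make_vec (float a, float b, float c, float d)
{
  /* All four lanes are non-constant, so the old expansion (n_var >
     n_elts/2) built the vector in a stack temporary and reloaded it;
     with this patch each element is inserted into the target register
     with a vec_set (INS) instruction instead.  */
  return (v4sf) { a, b, c, d };
}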

This patch is mostly Ramana's work; I've just cleaned it up a little.

This has been in a number of our trees lately, and we haven't seen any
regressions. I've also bootstrapped and tested it, and run a set of
benchmarks which show no regressions on Cortex-A57 or Cortex-A53.

The patch fixes some regressions caused by the more aggressive vectorization
in GCC 6, so I'd like to propose it for inclusion even though we are in Stage 4.

OK?

Thanks,
James

---
gcc/

2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>

	* config/aarch64/aarch64.c (aarch64_expand_vector_init): Refactor,
	always use lane loads to construct non-constant vectors.

gcc/testsuite/

2016-01-20  James Greenhalgh  <james.greenhalgh@arm.com>
	    Ramana Radhakrishnan  <ramana.radhakrishnan@arm.com>

	* gcc.target/aarch64/vector_initialization_nostack.c: New.


[-- Attachment #2: 0001-Patch-AArch64-GCC-6-regression-in-vector-performance.patch --]
[-- Type: text/x-patch;  name=0001-Patch-AArch64-GCC-6-regression-in-vector-performance.patch, Size: 4809 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 03bc1b9..3787b38 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -10985,28 +10985,37 @@ aarch64_simd_make_constant (rtx vals)
     return NULL_RTX;
 }
 
+/* Expand a vector initialisation sequence, such that TARGET is
+   initialised to contain VALS.  */
+
 void
 aarch64_expand_vector_init (rtx target, rtx vals)
 {
   machine_mode mode = GET_MODE (target);
   machine_mode inner_mode = GET_MODE_INNER (mode);
+  /* The number of vector elements.  */
   int n_elts = GET_MODE_NUNITS (mode);
+  /* The number of vector elements which are not constant.  */
   int n_var = 0;
   rtx any_const = NULL_RTX;
+  /* The first element of vals.  */
+  rtx v0 = XVECEXP (vals, 0, 0);
   bool all_same = true;
 
+  /* Count the number of variable elements to initialise.  */
   for (int i = 0; i < n_elts; ++i)
     {
       rtx x = XVECEXP (vals, 0, i);
-      if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
+      if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
 	++n_var;
       else
 	any_const = x;
 
-      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
-	all_same = false;
+      all_same &= rtx_equal_p (x, v0);
     }
 
+  /* No variable elements, hand off to aarch64_simd_make_constant which knows
+     how best to handle this.  */
   if (n_var == 0)
     {
       rtx constant = aarch64_simd_make_constant (vals);
@@ -11020,14 +11029,15 @@ aarch64_expand_vector_init (rtx target, rtx vals)
   /* Splat a single non-constant element if we can.  */
   if (all_same)
     {
-      rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
+      rtx x = copy_to_mode_reg (inner_mode, v0);
       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
       return;
     }
 
-  /* Half the fields (or less) are non-constant.  Load constant then overwrite
-     varying fields.  Hope that this is more efficient than using the stack.  */
-  if (n_var <= n_elts/2)
+  /* Initialise a vector which is part-variable.  We want to first try
+     to build those lanes which are constant in the most efficient way we
+     can.  */
+  if (n_var != n_elts)
     {
       rtx copy = copy_rtx (vals);
 
@@ -11054,31 +11064,21 @@ aarch64_expand_vector_init (rtx target, rtx vals)
 	  XVECEXP (copy, 0, i) = subst;
 	}
       aarch64_expand_vector_init (target, copy);
+    }
 
-      /* Insert variables.  */
-      enum insn_code icode = optab_handler (vec_set_optab, mode);
-      gcc_assert (icode != CODE_FOR_nothing);
+  /* Insert the variable lanes directly.  */
 
-      for (int i = 0; i < n_elts; i++)
-	{
-	  rtx x = XVECEXP (vals, 0, i);
-	  if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
-	    continue;
-	  x = copy_to_mode_reg (inner_mode, x);
-	  emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
-	}
-      return;
-    }
+  enum insn_code icode = optab_handler (vec_set_optab, mode);
+  gcc_assert (icode != CODE_FOR_nothing);
 
-  /* Construct the vector in memory one field at a time
-     and load the whole vector.  */
-  rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
   for (int i = 0; i < n_elts; i++)
-    emit_move_insn (adjust_address_nv (mem, inner_mode,
-				    i * GET_MODE_SIZE (inner_mode)),
-		    XVECEXP (vals, 0, i));
-  emit_move_insn (target, mem);
-
+    {
+      rtx x = XVECEXP (vals, 0, i);
+      if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
+	continue;
+      x = copy_to_mode_reg (inner_mode, x);
+      emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
+    }
 }
 
 static unsigned HOST_WIDE_INT
diff --git a/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
new file mode 100644
index 0000000..bbad04d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vector_initialization_nostack.c
@@ -0,0 +1,53 @@
+/* { dg-do compile } */
+/* { dg-options "-O3 -ftree-vectorize -fno-vect-cost-model" } */
+float arr_f[100][100];
+float
+f9 (void)
+{
+
+  int i;
+  float sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr_f[i][0] * arr_f[0][i];
+  return sum;
+
+}
+
+
+int arr[100][100];
+int
+f10 (void)
+{
+
+  int i;
+  int sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr[i][0] * arr[0][i];
+  return sum;
+
+}
+
+double arr_d[100][100];
+double
+f11 (void)
+{
+  int i;
+  double sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr_d[i][0] * arr_d[0][i];
+  return sum;
+}
+
+char arr_c[100][100];
+char
+f12 (void)
+{
+  int i;
+  char sum = 0;
+  for (i = 0; i < 100; i++)
+    sum += arr_c[i][0] * arr_c[0][i];
+  return sum;
+}
+
+
+/* { dg-final { scan-assembler-not "sp" } } */


Thread overview: 5+ messages
2016-01-20 15:22 James Greenhalgh [this message]
2016-02-02 10:29 ` James Greenhalgh
2016-02-08 10:56   ` James Greenhalgh
2016-02-15 10:49     ` James Greenhalgh
2016-02-16  8:42 ` Marcus Shawcroft
