public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Detect a pack-unpack pattern in GCC vectorizer and optimize it.
@ 2014-04-24 17:27 Cong Hou
  2014-04-28 11:17 ` Richard Biener
  0 siblings, 1 reply; 6+ messages in thread
From: Cong Hou @ 2014-04-24 17:27 UTC (permalink / raw)
  To: GCC Patches; +Cc: Richard Biener

[-- Attachment #1: Type: text/plain, Size: 1175 bytes --]

Given the following loop:

int a[N];
short b[N*2];

for (int i = 0; i < N; ++i)
  a[i] = b[i*2];


After being vectorized, the access to b[i*2] will be compiled into
several packing statements, while the type promotion from short to int
will be compiled into several unpacking statements. With this patch,
each pair of pack/unpack statements will be replaced by less expensive
statements (with shift or bit-and operations).

On x86_64, the loop above will be compiled into the following assembly
(with -O2 -ftree-vectorize):

movdqu 0x10(%rcx),%xmm3
movdqu -0x20(%rcx),%xmm0
movdqa %xmm0,%xmm2
punpcklwd %xmm3,%xmm0
punpckhwd %xmm3,%xmm2
movdqa %xmm0,%xmm3
punpcklwd %xmm2,%xmm0
punpckhwd %xmm2,%xmm3
movdqa %xmm1,%xmm2
punpcklwd %xmm3,%xmm0
pcmpgtw %xmm0,%xmm2
movdqa %xmm0,%xmm3
punpckhwd %xmm2,%xmm0
punpcklwd %xmm2,%xmm3
movups %xmm0,-0x10(%rdx)
movups %xmm3,-0x20(%rdx)


With this patch, the generated assembly is shown below:

movdqu 0x10(%rcx),%xmm0
movdqu -0x20(%rcx),%xmm1
pslld  $0x10,%xmm0
psrad  $0x10,%xmm0
pslld  $0x10,%xmm1
movups %xmm0,-0x10(%rdx)
psrad  $0x10,%xmm1
movups %xmm1,-0x20(%rdx)


Bootstrapped and tested on x86-64. OK for trunk?


thanks,
Cong

[-- Attachment #2: patch-pack-unpack-pattern.txt --]
[-- Type: text/plain, Size: 10198 bytes --]

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 117cdd0..e7143f1 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2014-04-23  Cong Hou  <congh@google.com>
+
+	* tree-vect-stmts.c (detect_pack_unpack_pattern): New function.
+	(vect_gen_widened_results_half): Call detect_pack_unpack_pattern.
+
 2014-04-23  David Malcolm  <dmalcolm@redhat.com>
 
 	* is-a.h: Update comments to reflect the following changes to the
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 62b07f4..a8755b3 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2014-04-23  Cong Hou  <congh@google.com>
+
+	* gcc.dg/vect/vect-125.c: New test.
+
 2014-04-23  Jeff Law  <law@redhat.com>
 
 	PR tree-optimization/60902
diff --git a/gcc/testsuite/gcc.dg/vect/vect-125.c b/gcc/testsuite/gcc.dg/vect/vect-125.c
new file mode 100644
index 0000000..988dea6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-125.c
@@ -0,0 +1,122 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <limits.h>
+#include "tree-vect.h"
+
+#define N 64
+
+char b[N];
+unsigned char c[N];
+short d[N];
+unsigned short e[N];
+
+__attribute__((noinline)) void
+test1 ()
+{
+  int a[N];
+  int i;
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = b[i*2];
+      a[i+N/2] = b[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != b[i*2] || a[i+N/2] != b[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = c[i*2];
+      a[i+N/2] = c[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != c[i*2] || a[i+N/2] != c[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = d[i*2];
+      a[i+N/2] = d[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != d[i*2] || a[i+N/2] != d[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = e[i*2];
+      a[i+N/2] = e[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != e[i*2] || a[i+N/2] != e[i*2+1])
+      abort ();
+}
+
+__attribute__((noinline)) void
+test2 ()
+{
+  unsigned int a[N];
+  int i;
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = b[i*2];
+      a[i+N/2] = b[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != b[i*2] || a[i+N/2] != b[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = c[i*2];
+      a[i+N/2] = c[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != c[i*2] || a[i+N/2] != c[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = d[i*2];
+      a[i+N/2] = d[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != d[i*2] || a[i+N/2] != d[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = e[i*2];
+      a[i+N/2] = e[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != e[i*2] || a[i+N/2] != e[i*2+1])
+      abort ();
+}
+
+int
+main ()
+{
+  b[0] = CHAR_MIN;
+  c[0] = UCHAR_MAX;
+  d[0] = SHRT_MIN;
+  e[0] = USHRT_MAX;
+
+  int i;
+  for (i = 1; i < N; i++)
+    {
+      b[i] = b[i-1] + 1;
+      c[i] = c[i-1] - 1;
+      d[i] = d[i-1] + 1;
+      e[i] = e[i-1] - 1;
+    }
+
+  test1 ();
+  test2 ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 4 loops" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "A pack-unpack pattern is recognized" 32 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 1a51d6d..d0cf1f4 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -3191,6 +3191,174 @@ vectorizable_simd_clone_call (gimple stmt, gimple_stmt_iterator *gsi,
 }
 
 
+/* Function detect_pack_unpack_pattern
+
+   Detect the following pattern:
+
+   S1  vect3 = VEC_PERM_EXPR <vect1, vect2, { 0, 2, 4, ... }>;
+   or
+   S1  vect3 = VEC_PERM_EXPR <vect1, vect2, { 1, 3, 5, ... }>;
+
+   S2  vect4 = [vec_unpack_lo_expr] vect3;
+   or/and
+   S3  vect5 = [vec_unpack_hi_expr] vect3;
+
+   S1 is usually generated from accessing even/odd elements of an array
+   and S2/S3 are generated from type promotion.  In this case S1 and S2/S3 can
+   be replaced by less expensive statements.  */
+
+static gimple
+detect_pack_unpack_pattern (enum tree_code code, tree vec_oprnd, int op_type,
+			    tree vec_dest, gimple_stmt_iterator *gsi,
+			    gimple stmt)
+{
+  if ((code != VEC_UNPACK_LO_EXPR && code != VEC_UNPACK_HI_EXPR)
+      || op_type != unary_op)
+    return NULL;
+
+  if (TREE_CODE (vec_oprnd) != SSA_NAME)
+    return NULL;
+
+  gimple def_stmt = SSA_NAME_DEF_STMT (vec_oprnd);
+  if (!is_gimple_assign (def_stmt))
+    return NULL;
+  enum tree_code def_code = gimple_assign_rhs_code (def_stmt);
+  if (def_code != VEC_PERM_EXPR)
+    return NULL;
+
+  bool unpack_lo = (code == VEC_UNPACK_LO_EXPR);
+  tree perm_vec = gimple_assign_rhs3 (def_stmt);
+  gcc_assert (TREE_CODE (perm_vec) == VECTOR_CST);
+
+  /* Check if VEC_PERM_EXPR is generated from accessing even/odd elements of
+     an array.  */
+  int nelts = VECTOR_CST_NELTS (perm_vec);
+  int odd_num = 0;
+  for (int i = 0; i < nelts / 2; i++)
+    {
+      tree elt = VECTOR_CST_ELT (perm_vec,
+				 unpack_lo ? i : i + nelts / 2);
+      int elt_val = (HOST_WIDE_INT) TREE_INT_CST_LOW (elt);
+      if (!unpack_lo)
+	elt_val -= nelts;
+
+      if (elt_val / 2 != i)
+	return NULL;
+      odd_num += elt_val & 1;
+    }
+
+  bool even_elem = false;
+  if (odd_num == 0)
+    even_elem = true;
+  else if (odd_num != nelts / 2)
+    return NULL;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "A pack-unpack pattern is recognized.\n");
+
+  tree vec_oprnd0 = unpack_lo ? gimple_assign_rhs1 (def_stmt)
+			      : gimple_assign_rhs2 (def_stmt);
+  tree vec_type = TREE_TYPE (vec_oprnd0);
+  tree scalar_type = lang_hooks.types.type_for_mode (
+		       GET_MODE_INNER (TYPE_MODE (vec_type)),
+		       TYPE_UNSIGNED (vec_type));
+  tree vectype_out = TREE_TYPE (vec_dest);
+
+  if (even_elem)
+    {
+      /* If even elements are packed, and if the original values are unsigned,
+	 build a bit_and statement to replace the pack-unpack statements.
+	 If the original values are signed, build a lshift statement and a
+	 rshift statement to replace the pack-unpack statements.  */
+
+      if (TYPE_UNSIGNED (vec_type))
+	{
+	  tree *mask = XALLOCAVEC (tree, nelts);
+	  unsigned HOST_WIDE_INT max_val = TREE_INT_CST_LOW (
+					     TYPE_MAX_VALUE (scalar_type));
+	  for (int k = 0; k < nelts; ++k)
+	    mask[k] = build_int_cst (scalar_type, (k & 1) ? 0 : max_val);
+	  tree vec_oprnd1 = build_vector (TREE_TYPE (vec_oprnd0), mask);
+	  tree temp = make_temp_ssa_name (vec_type, NULL, "vect_temp");
+
+	  gimple new_stmt1, new_stmt2;
+	  new_stmt1 = gimple_build_assign_with_ops (BIT_AND_EXPR, temp,
+						    vec_oprnd0, vec_oprnd1);
+	  new_stmt2 = gimple_build_assign_with_ops (NOP_EXPR, vec_dest,
+						    temp, NULL_TREE);
+	  gimple_assign_set_lhs (new_stmt2,
+				 make_ssa_name (vec_dest, new_stmt2));
+
+	  vect_finish_stmt_generation (stmt, new_stmt1, gsi);
+	  vect_finish_stmt_generation (stmt, new_stmt2, gsi);
+
+	  return new_stmt2;
+	}
+      else
+	{
+	  tree scalar_itype = lang_hooks.types.type_for_mode (
+				GET_MODE_INNER (TYPE_MODE (vectype_out)), 0);
+	  tree vecitype = get_vectype_for_scalar_type (scalar_itype);
+	  tree shift_oprnd = build_int_cst (scalar_type,
+					    TYPE_PRECISION (scalar_type));
+
+	  tree temp1 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+	  tree temp2 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+	  tree temp3 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+
+	  gimple new_stmt1, new_stmt2, new_stmt3, new_stmt4;
+	  new_stmt1 = gimple_build_assign_with_ops (NOP_EXPR, temp1,
+						    vec_oprnd0, NULL);
+	  new_stmt2 = gimple_build_assign_with_ops (LSHIFT_EXPR, temp2,
+						    temp1, shift_oprnd);
+	  new_stmt3 = gimple_build_assign_with_ops (RSHIFT_EXPR, temp3,
+						    temp2, shift_oprnd);
+	  new_stmt4 = gimple_build_assign_with_ops (NOP_EXPR, vec_dest,
+						    temp3, NULL);
+	  gimple_assign_set_lhs (new_stmt4,
+				 make_ssa_name (vec_dest, new_stmt4));
+
+	  vect_finish_stmt_generation (stmt, new_stmt1, gsi);
+	  vect_finish_stmt_generation (stmt, new_stmt2, gsi);
+	  vect_finish_stmt_generation (stmt, new_stmt3, gsi);
+	  vect_finish_stmt_generation (stmt, new_stmt4, gsi);
+
+	  return new_stmt4;
+	}
+    }
+  else
+    {
+      /* If odd elements are packed, build a rshift statement to replace
+	 the pack-unpack statements.  */
+
+      gimple new_stmt1, new_stmt2, new_stmt3;
+      tree scalar_itype = lang_hooks.types.type_for_mode (
+			    GET_MODE_INNER (TYPE_MODE (vectype_out)),
+			    TYPE_UNSIGNED (scalar_type));
+      tree vecitype = get_vectype_for_scalar_type (scalar_itype);
+      tree temp1 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+      tree temp2 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+      tree shift_oprnd = build_int_cst (scalar_type,
+					TYPE_PRECISION (scalar_type));
+
+      new_stmt1 = gimple_build_assign_with_ops (NOP_EXPR, temp1,
+						vec_oprnd0, NULL);
+      new_stmt2 = gimple_build_assign_with_ops (RSHIFT_EXPR, temp2,
+						temp1, shift_oprnd);
+      new_stmt3 = gimple_build_assign_with_ops (NOP_EXPR, vec_dest,
+						temp2, NULL);
+      gimple_assign_set_lhs (new_stmt3,
+			     make_ssa_name (vec_dest, new_stmt2));
+
+      vect_finish_stmt_generation (stmt, new_stmt1, gsi);
+      vect_finish_stmt_generation (stmt, new_stmt2, gsi);
+      vect_finish_stmt_generation (stmt, new_stmt3, gsi);
+
+      return new_stmt3;
+    }
+}
+
 /* Function vect_gen_widened_results_half
 
    Create a vector stmt whose code, type, number of arguments, and result
@@ -3223,10 +3391,15 @@ vect_gen_widened_results_half (enum tree_code code,
     }
   else
     {
+      if ((new_stmt = detect_pack_unpack_pattern (
+			  code, vec_oprnd0, op_type, vec_dest, gsi, stmt)))
+        return new_stmt;
+
       /* Generic support */
       gcc_assert (op_type == TREE_CODE_LENGTH (code));
       if (op_type != binary_op)
 	vec_oprnd1 = NULL;
+
       new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
 					       vec_oprnd1);
       new_temp = make_ssa_name (vec_dest, new_stmt);

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2014-11-21 11:50 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-04-24 17:27 [PATCH] Detect a pack-unpack pattern in GCC vectorizer and optimize it Cong Hou
2014-04-28 11:17 ` Richard Biener
2014-05-03  0:39   ` Cong Hou
2014-06-24 11:05     ` Richard Biener
2014-06-25 17:34       ` Cong Hou
2014-11-21 12:03         ` Evgeny Stupachenko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).