public inbox for gcc-patches@gcc.gnu.org
* [PATCH] Detect a pack-unpack pattern in GCC vectorizer and optimize it.
@ 2014-04-24 17:27 Cong Hou
  2014-04-28 11:17 ` Richard Biener
  0 siblings, 1 reply; 6+ messages in thread
From: Cong Hou @ 2014-04-24 17:27 UTC (permalink / raw)
  To: GCC Patches; +Cc: Richard Biener

[-- Attachment #1: Type: text/plain, Size: 1175 bytes --]

Given the following loop:

int a[N];
short b[N*2];

for (int i = 0; i < N; ++i)
  a[i] = b[i*2];


After being vectorized, the access to b[i*2] will be compiled into
several packing statements, while the type promotion from short to int
will be compiled into several unpacking statements. With this patch,
each pair of pack/unpack statements will be replaced by less expensive
statements (with shift or bit-and operations).
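
At the scalar level the replacement relies on a simple per-lane identity.
The sketch below is for illustration only (the helper names are invented);
it assumes the even short sits in the low half of each 32-bit lane, as on
little-endian two's-complement targets, and shows what the new statements
compute for one lane.  The signed even-element case corresponds to the
pslld/psrad pairs in the assembly further below.

/* Illustration only (not part of the patch): scalar view of what the new
   statements compute for one 32-bit lane holding two adjacent shorts.
   Assumes little-endian layout and a two's-complement target.  */
#include <stdint.h>

/* Even (low) element, signed source: LSHIFT_EXPR then RSHIFT_EXPR by 16.  */
static inline int32_t
extract_even_signed (uint32_t lane)
{
  return (int32_t) (lane << 16) >> 16;
}

/* Even (low) element, unsigned source: a single BIT_AND_EXPR.  */
static inline uint32_t
extract_even_unsigned (uint32_t lane)
{
  return lane & 0xffff;
}

/* Odd (high) element: a single RSHIFT_EXPR by 16 (arithmetic shift for a
   signed source, logical shift for an unsigned one).  */
static inline int32_t
extract_odd_signed (uint32_t lane)
{
  return (int32_t) lane >> 16;
}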

On x86_64, the loop above will be compiled into the following assembly
(with -O2 -ftree-vectorize):

movdqu 0x10(%rcx),%xmm3
movdqu -0x20(%rcx),%xmm0
movdqa %xmm0,%xmm2
punpcklwd %xmm3,%xmm0
punpckhwd %xmm3,%xmm2
movdqa %xmm0,%xmm3
punpcklwd %xmm2,%xmm0
punpckhwd %xmm2,%xmm3
movdqa %xmm1,%xmm2
punpcklwd %xmm3,%xmm0
pcmpgtw %xmm0,%xmm2
movdqa %xmm0,%xmm3
punpckhwd %xmm2,%xmm0
punpcklwd %xmm2,%xmm3
movups %xmm0,-0x10(%rdx)
movups %xmm3,-0x20(%rdx)


With this patch, the generated assembly is shown below:

movdqu 0x10(%rcx),%xmm0
movdqu -0x20(%rcx),%xmm1
pslld  $0x10,%xmm0
psrad  $0x10,%xmm0
pslld  $0x10,%xmm1
movups %xmm0,-0x10(%rdx)
psrad  $0x10,%xmm1
movups %xmm1,-0x20(%rdx)
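
In intrinsics form, that sequence corresponds roughly to the sketch below
(illustration only; the function and pointer names are invented, and it
covers just the two vectors handled above):

#include <emmintrin.h>  /* SSE2 */

/* Sketch of the shift-based code above: each 128-bit load grabs eight
   shorts (four even/odd pairs); the pslld/psrad pair sign-extends the even
   short of every 32-bit lane, and the results are stored as four ints.  */
static void
widen_even_shorts (const short *src, int *dst)
{
  __m128i v0 = _mm_loadu_si128 ((const __m128i *) src);        /* movdqu */
  __m128i v1 = _mm_loadu_si128 ((const __m128i *) (src + 8));  /* movdqu */

  v0 = _mm_slli_epi32 (v0, 16);   /* pslld $0x10 */
  v0 = _mm_srai_epi32 (v0, 16);   /* psrad $0x10 */
  v1 = _mm_slli_epi32 (v1, 16);
  v1 = _mm_srai_epi32 (v1, 16);

  _mm_storeu_si128 ((__m128i *) dst, v0);        /* movups */
  _mm_storeu_si128 ((__m128i *) (dst + 4), v1);  /* movups */
}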


Bootstrapped and tested on x86-64. OK for trunk?


thanks,
Cong

[-- Attachment #2: patch-pack-unpack-pattern.txt --]
[-- Type: text/plain, Size: 10198 bytes --]

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 117cdd0..e7143f1 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2014-04-23  Cong Hou  <congh@google.com>
+
+	* tree-vect-stmts.c (detect_pack_unpack_pattern): New function.
+	(vect_gen_widened_results_half): Call detect_pack_unpack_pattern.
+
 2014-04-23  David Malcolm  <dmalcolm@redhat.com>
 
 	* is-a.h: Update comments to reflect the following changes to the
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 62b07f4..a8755b3 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2014-04-23  Cong Hou  <congh@google.com>
+
+	* gcc.dg/vect/vect-125.c: New test.
+
 2014-04-23  Jeff Law  <law@redhat.com>
 
 	PR tree-optimization/60902
diff --git a/gcc/testsuite/gcc.dg/vect/vect-125.c b/gcc/testsuite/gcc.dg/vect/vect-125.c
new file mode 100644
index 0000000..988dea6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-125.c
@@ -0,0 +1,122 @@
+/* { dg-require-effective-target vect_int } */
+
+#include <limits.h>
+#include "tree-vect.h"
+
+#define N 64
+
+char b[N];
+unsigned char c[N];
+short d[N];
+unsigned short e[N];
+
+__attribute__((noinline)) void
+test1 ()
+{
+  int a[N];
+  int i;
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = b[i*2];
+      a[i+N/2] = b[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != b[i*2] || a[i+N/2] != b[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = c[i*2];
+      a[i+N/2] = c[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != c[i*2] || a[i+N/2] != c[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = d[i*2];
+      a[i+N/2] = d[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != d[i*2] || a[i+N/2] != d[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = e[i*2];
+      a[i+N/2] = e[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != e[i*2] || a[i+N/2] != e[i*2+1])
+      abort ();
+}
+
+__attribute__((noinline)) void
+test2 ()
+{
+  unsigned int a[N];
+  int i;
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = b[i*2];
+      a[i+N/2] = b[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != b[i*2] || a[i+N/2] != b[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = c[i*2];
+      a[i+N/2] = c[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != c[i*2] || a[i+N/2] != c[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = d[i*2];
+      a[i+N/2] = d[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != d[i*2] || a[i+N/2] != d[i*2+1])
+      abort ();
+
+  for (i = 0; i < N/2; i++)
+    {
+      a[i] = e[i*2];
+      a[i+N/2] = e[i*2+1];
+    }
+  for (i = 0; i < N/2; i++)
+    if (a[i] != e[i*2] || a[i+N/2] != e[i*2+1])
+      abort ();
+}
+
+int
+main ()
+{
+  b[0] = CHAR_MIN;
+  c[0] = UCHAR_MAX;
+  d[0] = SHRT_MIN;
+  e[0] = USHRT_MAX;
+
+  int i;
+  for (i = 1; i < N; i++)
+    {
+      b[i] = b[i-1] + 1;
+      c[i] = c[i-1] - 1;
+      d[i] = d[i-1] + 1;
+      e[i] = e[i-1] - 1;
+    }
+
+  test1 ();
+  test2 ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 4 loops" 2 "vect" } } */
+/* { dg-final { scan-tree-dump-times "A pack-unpack pattern is recognized" 32 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
+
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 1a51d6d..d0cf1f4 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -3191,6 +3191,174 @@ vectorizable_simd_clone_call (gimple stmt, gimple_stmt_iterator *gsi,
 }
 
 
+/* Function detect_pack_unpack_pattern
+
+   Detect the following pattern:
+
+   S1  vect3 = VEC_PERM_EXPR <vect1, vect2, { 0, 2, 4, ... }>;
+   or
+   S1  vect3 = VEC_PERM_EXPR <vect1, vect2, { 1, 3, 5, ... }>;
+
+   S2  vect4 = [vec_unpack_lo_expr] vect3;
+   or/and
+   S3  vect5 = [vec_unpack_hi_expr] vect3;
+
+   S1 is usually generated from accessing even/odd elements of an array
+   and S2/S3 are generated from type promotion.  In this case S1 and S2/S3 can
+   be replaced by less expensive statements.  */
+
+static gimple
+detect_pack_unpack_pattern (enum tree_code code, tree vec_oprnd, int op_type,
+			    tree vec_dest, gimple_stmt_iterator *gsi,
+			    gimple stmt)
+{
+  if ((code != VEC_UNPACK_LO_EXPR && code != VEC_UNPACK_HI_EXPR)
+      || op_type != unary_op)
+    return NULL;
+
+  if (TREE_CODE (vec_oprnd) != SSA_NAME)
+    return NULL;
+
+  gimple def_stmt = SSA_NAME_DEF_STMT (vec_oprnd);
+  if (!is_gimple_assign (def_stmt))
+    return NULL;
+  enum tree_code def_code = gimple_assign_rhs_code (def_stmt);
+  if (def_code != VEC_PERM_EXPR)
+    return NULL;
+
+  bool unpack_lo = (code == VEC_UNPACK_LO_EXPR);
+  tree perm_vec = gimple_assign_rhs3 (def_stmt);
+  gcc_assert (TREE_CODE (perm_vec) == VECTOR_CST);
+
+  /* Check if VEC_PERM_EXPR is generated from accessing even/odd elements of
+     an array.  */
+  int nelts = VECTOR_CST_NELTS (perm_vec);
+  int odd_num = 0;
+  for (int i = 0; i < nelts / 2; i++)
+    {
+      tree elt = VECTOR_CST_ELT (perm_vec,
+				 unpack_lo ? i : i + nelts / 2);
+      int elt_val = (HOST_WIDE_INT) TREE_INT_CST_LOW (elt);
+      if (!unpack_lo)
+	elt_val -= nelts;
+
+      if (elt_val / 2 != i)
+	return NULL;
+      odd_num += elt_val & 1;
+    }
+
+  bool even_elem = false;
+  if (odd_num == 0)
+    even_elem = true;
+  else if (odd_num != nelts / 2)
+    return NULL;
+
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "A pack-unpack pattern is recognized.\n");
+
+  tree vec_oprnd0 = unpack_lo ? gimple_assign_rhs1 (def_stmt)
+			      : gimple_assign_rhs2 (def_stmt);
+  tree vec_type = TREE_TYPE (vec_oprnd0);
+  tree scalar_type = lang_hooks.types.type_for_mode (
+		       GET_MODE_INNER (TYPE_MODE (vec_type)),
+		       TYPE_UNSIGNED (vec_type));
+  tree vectype_out = TREE_TYPE (vec_dest);
+
+  if (even_elem)
+    {
+      /* If even elements are packed, and if the original values are unsigned,
+	 build a bit_and statement to replace the pack-unpack statements.
+	 If the original values are signed, build a lshift statement and a
+	 rshift statement to replace the pack-unpack statements.  */
+
+      if (TYPE_UNSIGNED (vec_type))
+	{
+	  tree *mask = XALLOCAVEC (tree, nelts);
+	  unsigned HOST_WIDE_INT max_val = TREE_INT_CST_LOW (
+					     TYPE_MAX_VALUE (scalar_type));
+	  for (int k = 0; k < nelts; ++k)
+	    mask[k] = build_int_cst (scalar_type, (k & 1) ? 0 : max_val);
+	  tree vec_oprnd1 = build_vector (TREE_TYPE (vec_oprnd0), mask);
+	  tree temp = make_temp_ssa_name (vec_type, NULL, "vect_temp");
+
+	  gimple new_stmt1, new_stmt2;
+	  new_stmt1 = gimple_build_assign_with_ops (BIT_AND_EXPR, temp,
+						    vec_oprnd0, vec_oprnd1);
+	  new_stmt2 = gimple_build_assign_with_ops (NOP_EXPR, vec_dest,
+						    temp, NULL_TREE);
+	  gimple_assign_set_lhs (new_stmt2,
+				 make_ssa_name (vec_dest, new_stmt2));
+
+	  vect_finish_stmt_generation (stmt, new_stmt1, gsi);
+	  vect_finish_stmt_generation (stmt, new_stmt2, gsi);
+
+	  return new_stmt2;
+	}
+      else
+	{
+	  tree scalar_itype = lang_hooks.types.type_for_mode (
+				GET_MODE_INNER (TYPE_MODE (vectype_out)), 0);
+	  tree vecitype = get_vectype_for_scalar_type (scalar_itype);
+	  tree shift_oprnd = build_int_cst (scalar_type,
+					    TYPE_PRECISION (scalar_type));
+
+	  tree temp1 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+	  tree temp2 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+	  tree temp3 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+
+	  gimple new_stmt1, new_stmt2, new_stmt3, new_stmt4;
+	  new_stmt1 = gimple_build_assign_with_ops (NOP_EXPR, temp1,
+						    vec_oprnd0, NULL);
+	  new_stmt2 = gimple_build_assign_with_ops (LSHIFT_EXPR, temp2,
+						    temp1, shift_oprnd);
+	  new_stmt3 = gimple_build_assign_with_ops (RSHIFT_EXPR, temp3,
+						    temp2, shift_oprnd);
+	  new_stmt4 = gimple_build_assign_with_ops (NOP_EXPR, vec_dest,
+						    temp3, NULL);
+	  gimple_assign_set_lhs (new_stmt4,
+				 make_ssa_name (vec_dest, new_stmt4));
+
+	  vect_finish_stmt_generation (stmt, new_stmt1, gsi);
+	  vect_finish_stmt_generation (stmt, new_stmt2, gsi);
+	  vect_finish_stmt_generation (stmt, new_stmt3, gsi);
+	  vect_finish_stmt_generation (stmt, new_stmt4, gsi);
+
+	  return new_stmt4;
+	}
+    }
+  else
+    {
+      /* If odd elements are packed, build a rshift statement to replace
+	 the pack-unpack statements.  */
+
+      gimple new_stmt1, new_stmt2, new_stmt3;
+      tree scalar_itype = lang_hooks.types.type_for_mode (
+			    GET_MODE_INNER (TYPE_MODE (vectype_out)),
+			    TYPE_UNSIGNED (scalar_type));
+      tree vecitype = get_vectype_for_scalar_type (scalar_itype);
+      tree temp1 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+      tree temp2 = make_temp_ssa_name (vecitype, NULL, "vect_temp");
+      tree shift_oprnd = build_int_cst (scalar_type,
+					TYPE_PRECISION (scalar_type));
+
+      new_stmt1 = gimple_build_assign_with_ops (NOP_EXPR, temp1,
+						vec_oprnd0, NULL);
+      new_stmt2 = gimple_build_assign_with_ops (RSHIFT_EXPR, temp2,
+						temp1, shift_oprnd);
+      new_stmt3 = gimple_build_assign_with_ops (NOP_EXPR, vec_dest,
+						temp2, NULL);
+      gimple_assign_set_lhs (new_stmt3,
+			     make_ssa_name (vec_dest, new_stmt3));
+
+      vect_finish_stmt_generation (stmt, new_stmt1, gsi);
+      vect_finish_stmt_generation (stmt, new_stmt2, gsi);
+      vect_finish_stmt_generation (stmt, new_stmt3, gsi);
+
+      return new_stmt3;
+    }
+}
+
 /* Function vect_gen_widened_results_half
 
    Create a vector stmt whose code, type, number of arguments, and result
@@ -3223,10 +3391,15 @@ vect_gen_widened_results_half (enum tree_code code,
     }
   else
     {
+      if ((new_stmt = detect_pack_unpack_pattern (
+			  code, vec_oprnd0, op_type, vec_dest, gsi, stmt)))
+        return new_stmt;
+
       /* Generic support */
       gcc_assert (op_type == TREE_CODE_LENGTH (code));
       if (op_type != binary_op)
 	vec_oprnd1 = NULL;
+
       new_stmt = gimple_build_assign_with_ops (code, vec_dest, vec_oprnd0,
 					       vec_oprnd1);
       new_temp = make_ssa_name (vec_dest, new_stmt);


* Re: [PATCH] Detect a pack-unpack pattern in GCC vectorizer and optimize it.
  2014-04-24 17:27 [PATCH] Detect a pack-unpack pattern in GCC vectorizer and optimize it Cong Hou
@ 2014-04-28 11:17 ` Richard Biener
  2014-05-03  0:39   ` Cong Hou
  0 siblings, 1 reply; 6+ messages in thread
From: Richard Biener @ 2014-04-28 11:17 UTC (permalink / raw)
  To: Cong Hou; +Cc: GCC Patches

On Thu, 24 Apr 2014, Cong Hou wrote:

> Given the following loop:
> 
> int a[N];
> short b[N*2];
> 
> for (int i = 0; i < N; ++i)
>   a[i] = b[i*2];
> 
> 
> After being vectorized, the access to b[i*2] will be compiled into
> several packing statements, while the type promotion from short to int
> will be compiled into several unpacking statements. With this patch,
> each pair of pack/unpack statements will be replaced by less expensive
> statements (with shift or bit-and operations).
> 
> On x86_64, the loop above will be compiled into the following assembly
> (with -O2 -ftree-vectorize):
> 
> movdqu 0x10(%rcx),%xmm3
> movdqu -0x20(%rcx),%xmm0
> movdqa %xmm0,%xmm2
> punpcklwd %xmm3,%xmm0
> punpckhwd %xmm3,%xmm2
> movdqa %xmm0,%xmm3
> punpcklwd %xmm2,%xmm0
> punpckhwd %xmm2,%xmm3
> movdqa %xmm1,%xmm2
> punpcklwd %xmm3,%xmm0
> pcmpgtw %xmm0,%xmm2
> movdqa %xmm0,%xmm3
> punpckhwd %xmm2,%xmm0
> punpcklwd %xmm2,%xmm3
> movups %xmm0,-0x10(%rdx)
> movups %xmm3,-0x20(%rdx)
> 
> 
> With this patch, the generated assembly is shown below:
> 
> movdqu 0x10(%rcx),%xmm0
> movdqu -0x20(%rcx),%xmm1
> pslld  $0x10,%xmm0
> psrad  $0x10,%xmm0
> pslld  $0x10,%xmm1
> movups %xmm0,-0x10(%rdx)
> psrad  $0x10,%xmm1
> movups %xmm1,-0x20(%rdx)
> 
> 
> Bootstrapped and tested on x86-64. OK for trunk?

This is an odd place to implement such a transform.  Also,
whether it is faster or not depends on the exact ISA you target -
for example, ppc has constraints on the maximum number of shifts
carried out in parallel, and the above has 4 in very short
succession, especially on the sign-extend path.

So this looks more like an opportunity for a post-vectorizer
transform on RTL or for the vectorizer special-casing
widening loads with a vectorizer pattern.

Richard.


* Re: [PATCH] Detect a pack-unpack pattern in GCC vectorizer and optimize it.
  2014-04-28 11:17 ` Richard Biener
@ 2014-05-03  0:39   ` Cong Hou
  2014-06-24 11:05     ` Richard Biener
  0 siblings, 1 reply; 6+ messages in thread
From: Cong Hou @ 2014-05-03  0:39 UTC (permalink / raw)
  To: Richard Biener; +Cc: GCC Patches

On Mon, Apr 28, 2014 at 4:04 AM, Richard Biener <rguenther@suse.de> wrote:
> On Thu, 24 Apr 2014, Cong Hou wrote:
>
>> Given the following loop:
>>
>> int a[N];
>> short b[N*2];
>>
>> for (int i = 0; i < N; ++i)
>>   a[i] = b[i*2];
>>
>>
>> After being vectorized, the access to b[i*2] will be compiled into
>> several packing statements, while the type promotion from short to int
>> will be compiled into several unpacking statements. With this patch,
>> each pair of pack/unpack statements will be replaced by less expensive
>> statements (with shift or bit-and operations).
>>
>> On x86_64, the loop above will be compiled into the following assembly
>> (with -O2 -ftree-vectorize):
>>
>> movdqu 0x10(%rcx),%xmm3
>> movdqu -0x20(%rcx),%xmm0
>> movdqa %xmm0,%xmm2
>> punpcklwd %xmm3,%xmm0
>> punpckhwd %xmm3,%xmm2
>> movdqa %xmm0,%xmm3
>> punpcklwd %xmm2,%xmm0
>> punpckhwd %xmm2,%xmm3
>> movdqa %xmm1,%xmm2
>> punpcklwd %xmm3,%xmm0
>> pcmpgtw %xmm0,%xmm2
>> movdqa %xmm0,%xmm3
>> punpckhwd %xmm2,%xmm0
>> punpcklwd %xmm2,%xmm3
>> movups %xmm0,-0x10(%rdx)
>> movups %xmm3,-0x20(%rdx)
>>
>>
>> With this patch, the generated assembly is shown below:
>>
>> movdqu 0x10(%rcx),%xmm0
>> movdqu -0x20(%rcx),%xmm1
>> pslld  $0x10,%xmm0
>> psrad  $0x10,%xmm0
>> pslld  $0x10,%xmm1
>> movups %xmm0,-0x10(%rdx)
>> psrad  $0x10,%xmm1
>> movups %xmm1,-0x20(%rdx)
>>
>>
>> Bootstrapped and tested on x86-64. OK for trunk?
>
> This is an odd place to implement such transform.  Also if it
> is faster or not depends on the exact ISA you target - for
> example ppc has constraints on the maximum number of shifts
> carried out in parallel and the above has 4 in very short
> succession.  Esp. for the sign-extend path.

Thank you for the information about ppc. If this is an issue, I think
we can do it in a target-dependent way.


>
> So this looks more like an opportunity for a post-vectorizer
> transform on RTL or for the vectorizer special-casing
> widening loads with a vectorizer pattern.

I am not sure whether the RTL transform is more difficult to implement. I
prefer the widening-load method, which can be detected in a pattern
recognizer. The target-related issue can be resolved by only
expanding the widening load on those targets where this pattern is
beneficial. But this requires new tree operations to be defined. What
is your suggestion?

I apologize for the delayed reply.


thanks,
Cong

>
> Richard.


* Re: [PATCH] Detect a pack-unpack pattern in GCC vectorizer and optimize it.
  2014-05-03  0:39   ` Cong Hou
@ 2014-06-24 11:05     ` Richard Biener
  2014-06-25 17:34       ` Cong Hou
  0 siblings, 1 reply; 6+ messages in thread
From: Richard Biener @ 2014-06-24 11:05 UTC (permalink / raw)
  To: Cong Hou; +Cc: Richard Biener, GCC Patches

On Sat, May 3, 2014 at 2:39 AM, Cong Hou <congh@google.com> wrote:
> On Mon, Apr 28, 2014 at 4:04 AM, Richard Biener <rguenther@suse.de> wrote:
>> On Thu, 24 Apr 2014, Cong Hou wrote:
>>
>>> Given the following loop:
>>>
>>> int a[N];
>>> short b[N*2];
>>>
>>> for (int i = 0; i < N; ++i)
>>>   a[i] = b[i*2];
>>>
>>>
>>> After being vectorized, the access to b[i*2] will be compiled into
>>> several packing statements, while the type promotion from short to int
>>> will be compiled into several unpacking statements. With this patch,
>>> each pair of pack/unpack statements will be replaced by less expensive
>>> statements (with shift or bit-and operations).
>>>
>>> On x86_64, the loop above will be compiled into the following assembly
>>> (with -O2 -ftree-vectorize):
>>>
>>> movdqu 0x10(%rcx),%xmm3
>>> movdqu -0x20(%rcx),%xmm0
>>> movdqa %xmm0,%xmm2
>>> punpcklwd %xmm3,%xmm0
>>> punpckhwd %xmm3,%xmm2
>>> movdqa %xmm0,%xmm3
>>> punpcklwd %xmm2,%xmm0
>>> punpckhwd %xmm2,%xmm3
>>> movdqa %xmm1,%xmm2
>>> punpcklwd %xmm3,%xmm0
>>> pcmpgtw %xmm0,%xmm2
>>> movdqa %xmm0,%xmm3
>>> punpckhwd %xmm2,%xmm0
>>> punpcklwd %xmm2,%xmm3
>>> movups %xmm0,-0x10(%rdx)
>>> movups %xmm3,-0x20(%rdx)
>>>
>>>
>>> With this patch, the generated assembly is shown below:
>>>
>>> movdqu 0x10(%rcx),%xmm0
>>> movdqu -0x20(%rcx),%xmm1
>>> pslld  $0x10,%xmm0
>>> psrad  $0x10,%xmm0
>>> pslld  $0x10,%xmm1
>>> movups %xmm0,-0x10(%rdx)
>>> psrad  $0x10,%xmm1
>>> movups %xmm1,-0x20(%rdx)
>>>
>>>
>>> Bootstrapped and tested on x86-64. OK for trunk?
>>
>> This is an odd place to implement such transform.  Also if it
>> is faster or not depends on the exact ISA you target - for
>> example ppc has constraints on the maximum number of shifts
>> carried out in parallel and the above has 4 in very short
>> succession.  Esp. for the sign-extend path.
>
> Thank you for the information about ppc. If this is an issue, I think
> we can do it in a target dependent way.
>
>
>>
>> So this looks more like an opportunity for a post-vectorizer
>> transform on RTL or for the vectorizer special-casing
>> widening loads with a vectorizer pattern.
>
> I am not sure if the RTL transform is more difficult to implement. I
> prefer the widening loads method, which can be detected in a pattern
> recognizer. The target related issue will be resolved by only
> expanding the widening load on those targets where this pattern is
> beneficial. But this requires new tree operations to be defined. What
> is your suggestion?
>
> I apologize for the delayed reply.

Likewise ;)

I suggest implementing this optimization in vector lowering in
tree-vect-generic.c.  For your example, that pass sees

  vect__5.7_32 = MEM[symbol: b, index: ivtmp.15_13, offset: 0B];
  vect__5.8_34 = MEM[symbol: b, index: ivtmp.15_13, offset: 16B];
  vect_perm_even_35 = VEC_PERM_EXPR <vect__5.7_32, vect__5.8_34, { 0,
2, 4, 6, 8, 10, 12, 14 }>;
  vect__6.9_37 = [vec_unpack_lo_expr] vect_perm_even_35;
  vect__6.9_38 = [vec_unpack_hi_expr] vect_perm_even_35;

where you can apply the pattern matching and transform (after checking
with the target, of course).

Richard.

>
> thanks,
> Cong
>
>>
>> Richard.


* Re: [PATCH] Detect a pack-unpack pattern in GCC vectorizer and optimize it.
  2014-06-24 11:05     ` Richard Biener
@ 2014-06-25 17:34       ` Cong Hou
  2014-11-21 12:03         ` Evgeny Stupachenko
  0 siblings, 1 reply; 6+ messages in thread
From: Cong Hou @ 2014-06-25 17:34 UTC (permalink / raw)
  To: Richard Biener; +Cc: Richard Biener, GCC Patches

On Tue, Jun 24, 2014 at 4:05 AM, Richard Biener
<richard.guenther@gmail.com> wrote:
> On Sat, May 3, 2014 at 2:39 AM, Cong Hou <congh@google.com> wrote:
>> On Mon, Apr 28, 2014 at 4:04 AM, Richard Biener <rguenther@suse.de> wrote:
>>> On Thu, 24 Apr 2014, Cong Hou wrote:
>>>
>>>> Given the following loop:
>>>>
>>>> int a[N];
>>>> short b[N*2];
>>>>
>>>> for (int i = 0; i < N; ++i)
>>>>   a[i] = b[i*2];
>>>>
>>>>
>>>> After being vectorized, the access to b[i*2] will be compiled into
>>>> several packing statements, while the type promotion from short to int
>>>> will be compiled into several unpacking statements. With this patch,
>>>> each pair of pack/unpack statements will be replaced by less expensive
>>>> statements (with shift or bit-and operations).
>>>>
>>>> On x86_64, the loop above will be compiled into the following assembly
>>>> (with -O2 -ftree-vectorize):
>>>>
>>>> movdqu 0x10(%rcx),%xmm3
>>>> movdqu -0x20(%rcx),%xmm0
>>>> movdqa %xmm0,%xmm2
>>>> punpcklwd %xmm3,%xmm0
>>>> punpckhwd %xmm3,%xmm2
>>>> movdqa %xmm0,%xmm3
>>>> punpcklwd %xmm2,%xmm0
>>>> punpckhwd %xmm2,%xmm3
>>>> movdqa %xmm1,%xmm2
>>>> punpcklwd %xmm3,%xmm0
>>>> pcmpgtw %xmm0,%xmm2
>>>> movdqa %xmm0,%xmm3
>>>> punpckhwd %xmm2,%xmm0
>>>> punpcklwd %xmm2,%xmm3
>>>> movups %xmm0,-0x10(%rdx)
>>>> movups %xmm3,-0x20(%rdx)
>>>>
>>>>
>>>> With this patch, the generated assembly is shown below:
>>>>
>>>> movdqu 0x10(%rcx),%xmm0
>>>> movdqu -0x20(%rcx),%xmm1
>>>> pslld  $0x10,%xmm0
>>>> psrad  $0x10,%xmm0
>>>> pslld  $0x10,%xmm1
>>>> movups %xmm0,-0x10(%rdx)
>>>> psrad  $0x10,%xmm1
>>>> movups %xmm1,-0x20(%rdx)
>>>>
>>>>
>>>> Bootstrapped and tested on x86-64. OK for trunk?
>>>
>>> This is an odd place to implement such transform.  Also if it
>>> is faster or not depends on the exact ISA you target - for
>>> example ppc has constraints on the maximum number of shifts
>>> carried out in parallel and the above has 4 in very short
>>> succession.  Esp. for the sign-extend path.
>>
>> Thank you for the information about ppc. If this is an issue, I think
>> we can do it in a target dependent way.
>>
>>
>>>
>>> So this looks more like an opportunity for a post-vectorizer
>>> transform on RTL or for the vectorizer special-casing
>>> widening loads with a vectorizer pattern.
>>
>> I am not sure if the RTL transform is more difficult to implement. I
>> prefer the widening loads method, which can be detected in a pattern
>> recognizer. The target related issue will be resolved by only
>> expanding the widening load on those targets where this pattern is
>> beneficial. But this requires new tree operations to be defined. What
>> is your suggestion?
>>
>> I apologize for the delayed reply.
>
> Likewise ;)
>
> I suggest to implement this optimization in vector lowering in
> tree-vect-generic.c.  This sees for your example
>
>   vect__5.7_32 = MEM[symbol: b, index: ivtmp.15_13, offset: 0B];
>   vect__5.8_34 = MEM[symbol: b, index: ivtmp.15_13, offset: 16B];
>   vect_perm_even_35 = VEC_PERM_EXPR <vect__5.7_32, vect__5.8_34, { 0,
> 2, 4, 6, 8, 10, 12, 14 }>;
>   vect__6.9_37 = [vec_unpack_lo_expr] vect_perm_even_35;
>   vect__6.9_38 = [vec_unpack_hi_expr] vect_perm_even_35;
>
> where you can apply the pattern matching and transform (after checking
> with the target, of course).

This sounds good to me! I'll try to make a patch following your suggestion.

Thank you!


Cong

>
> Richard.
>
>>
>> thanks,
>> Cong
>>
>>>
>>> Richard.


* Re: [PATCH] Detect a pack-unpack pattern in GCC vectorizer and optimize it.
  2014-06-25 17:34       ` Cong Hou
@ 2014-11-21 12:03         ` Evgeny Stupachenko
  0 siblings, 0 replies; 6+ messages in thread
From: Evgeny Stupachenko @ 2014-11-21 12:03 UTC (permalink / raw)
  To: Cong Hou; +Cc: Richard Biener, Richard Biener, GCC Patches

Hi,

Please note that currently the test:

int a[N];
short b[N*2];

for (int i = 0; i < N; ++i)
  a[i] = b[i*2];

is compiled to (with -march=corei7 -O2 -ftree-vectorize):

        movdqa  b(%rax), %xmm0
        movdqa  b-16(%rax), %xmm2
        pand    %xmm1, %xmm0
        pand    %xmm1, %xmm2
        packusdw        %xmm2, %xmm0
        pmovsxwd        %xmm0, %xmm2
        psrldq  $8, %xmm0
        pmovsxwd        %xmm0, %xmm0
        movaps  %xmm2, a-32(%rax)
        movaps  %xmm0, a-16(%rax)

This is closer to the requested sequence.
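
Roughly, in SSE4.1 intrinsics, that corresponds to the sketch below (added
for illustration only, not compiler output; the function name, pointers,
and explicit mask constant are invented, and 16-byte-aligned data is
assumed to match the movdqa/movaps above):

#include <smmintrin.h>  /* SSE4.1 */

/* Sketch of the corei7 sequence above: mask out the odd shorts, pack the
   even shorts back together, then widen them with pmovsxwd.  */
static void
widen_even_shorts_sse41 (const short *src, int *dst)
{
  const __m128i mask = _mm_set1_epi32 (0xffff);  /* keep the low short of each dword */

  __m128i v0 = _mm_load_si128 ((const __m128i *) src);        /* movdqa */
  __m128i v1 = _mm_load_si128 ((const __m128i *) (src + 8));  /* movdqa */

  v0 = _mm_and_si128 (v0, mask);   /* pand */
  v1 = _mm_and_si128 (v1, mask);   /* pand */

  /* Unsigned saturation is safe here: the masked dwords all fit in 16 bits.  */
  __m128i evens = _mm_packus_epi32 (v0, v1);                   /* packusdw */

  __m128i lo = _mm_cvtepi16_epi32 (evens);                     /* pmovsxwd */
  __m128i hi = _mm_cvtepi16_epi32 (_mm_srli_si128 (evens, 8)); /* psrldq + pmovsxwd */

  _mm_store_si128 ((__m128i *) dst, lo);        /* movaps */
  _mm_store_si128 ((__m128i *) (dst + 4), hi);  /* movaps */
}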

Thanks,
Evgeny


On Wed, Jun 25, 2014 at 8:34 PM, Cong Hou <congh@google.com> wrote:
> On Tue, Jun 24, 2014 at 4:05 AM, Richard Biener
> <richard.guenther@gmail.com> wrote:
>> On Sat, May 3, 2014 at 2:39 AM, Cong Hou <congh@google.com> wrote:
>>> On Mon, Apr 28, 2014 at 4:04 AM, Richard Biener <rguenther@suse.de> wrote:
>>>> On Thu, 24 Apr 2014, Cong Hou wrote:
>>>>
>>>>> Given the following loop:
>>>>>
>>>>> int a[N];
>>>>> short b[N*2];
>>>>>
>>>>> for (int i = 0; i < N; ++i)
>>>>>   a[i] = b[i*2];
>>>>>
>>>>>
>>>>> After being vectorized, the access to b[i*2] will be compiled into
>>>>> several packing statements, while the type promotion from short to int
>>>>> will be compiled into several unpacking statements. With this patch,
>>>>> each pair of pack/unpack statements will be replaced by less expensive
>>>>> statements (with shift or bit-and operations).
>>>>>
>>>>> On x86_64, the loop above will be compiled into the following assembly
>>>>> (with -O2 -ftree-vectorize):
>>>>>
>>>>> movdqu 0x10(%rcx),%xmm3
>>>>> movdqu -0x20(%rcx),%xmm0
>>>>> movdqa %xmm0,%xmm2
>>>>> punpcklwd %xmm3,%xmm0
>>>>> punpckhwd %xmm3,%xmm2
>>>>> movdqa %xmm0,%xmm3
>>>>> punpcklwd %xmm2,%xmm0
>>>>> punpckhwd %xmm2,%xmm3
>>>>> movdqa %xmm1,%xmm2
>>>>> punpcklwd %xmm3,%xmm0
>>>>> pcmpgtw %xmm0,%xmm2
>>>>> movdqa %xmm0,%xmm3
>>>>> punpckhwd %xmm2,%xmm0
>>>>> punpcklwd %xmm2,%xmm3
>>>>> movups %xmm0,-0x10(%rdx)
>>>>> movups %xmm3,-0x20(%rdx)
>>>>>
>>>>>
>>>>> With this patch, the generated assembly is shown below:
>>>>>
>>>>> movdqu 0x10(%rcx),%xmm0
>>>>> movdqu -0x20(%rcx),%xmm1
>>>>> pslld  $0x10,%xmm0
>>>>> psrad  $0x10,%xmm0
>>>>> pslld  $0x10,%xmm1
>>>>> movups %xmm0,-0x10(%rdx)
>>>>> psrad  $0x10,%xmm1
>>>>> movups %xmm1,-0x20(%rdx)
>>>>>
>>>>>
>>>>> Bootstrapped and tested on x86-64. OK for trunk?
>>>>
>>>> This is an odd place to implement such transform.  Also if it
>>>> is faster or not depends on the exact ISA you target - for
>>>> example ppc has constraints on the maximum number of shifts
>>>> carried out in parallel and the above has 4 in very short
>>>> succession.  Esp. for the sign-extend path.
>>>
>>> Thank you for the information about ppc. If this is an issue, I think
>>> we can do it in a target dependent way.
>>>
>>>
>>>>
>>>> So this looks more like an opportunity for a post-vectorizer
>>>> transform on RTL or for the vectorizer special-casing
>>>> widening loads with a vectorizer pattern.
>>>
>>> I am not sure if the RTL transform is more difficult to implement. I
>>> prefer the widening loads method, which can be detected in a pattern
>>> recognizer. The target related issue will be resolved by only
>>> expanding the widening load on those targets where this pattern is
>>> beneficial. But this requires new tree operations to be defined. What
>>> is your suggestion?
>>>
>>> I apologize for the delayed reply.
>>
>> Likewise ;)
>>
>> I suggest to implement this optimization in vector lowering in
>> tree-vect-generic.c.  This sees for your example
>>
>>   vect__5.7_32 = MEM[symbol: b, index: ivtmp.15_13, offset: 0B];
>>   vect__5.8_34 = MEM[symbol: b, index: ivtmp.15_13, offset: 16B];
>>   vect_perm_even_35 = VEC_PERM_EXPR <vect__5.7_32, vect__5.8_34, { 0,
>> 2, 4, 6, 8, 10, 12, 14 }>;
>>   vect__6.9_37 = [vec_unpack_lo_expr] vect_perm_even_35;
>>   vect__6.9_38 = [vec_unpack_hi_expr] vect_perm_even_35;
>>
>> where you can apply the pattern matching and transform (after checking
>> with the target, of course).
>
> This sounds good to me! I'll try to make a patch following your suggestion.
>
> Thank you!
>
>
> Cong
>
>>
>> Richard.
>>
>>>
>>> thanks,
>>> Cong
>>>
>>>>
>>>> Richard.

