public inbox for gcc-patches@gcc.gnu.org
From: liuhongt <hongtao.liu@intel.com>
To: gcc-patches@gcc.gnu.org
Cc: jakub@rehat.com, crazylht@gmail.com
Subject: [PATCH] [i386] Optimize __builtin_shuffle_vector.
Date: Mon, 16 Aug 2021 13:18:38 +0800	[thread overview]
Message-ID: <20210816051838.868413-1-hongtao.liu@intel.com> (raw)
In-Reply-To: <20210813084753.GG2380545@tucnak>

Hi:
  Here's the updated patch, which does 3 things:
1. Support vpermw/vpermb in ix86_expand_vec_one_operand_perm_avx512.
2. Support 256/128-bit vpermi2b in ix86_expand_vec_perm_vpermt2.
3. Add define_insn_and_split patterns to optimize specific vector permutations to vpmov{dw,wb,qd} (see the example below).
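
For illustration only (a sketch mirroring the new tests, not part of the patch):
with -O2 -mavx512vl -mavx512vbmi the shuffles below are expected to become a
single vpmovdw, vpermw and vpermi2b respectively, instead of a constant-pool
load plus a generic permutation.  The function names here are made up; the
exact codegen depends on the enabled ISA extensions.

typedef short v8hi __attribute__((vector_size (16)));
typedef short v16hi __attribute__((vector_size (32)));
typedef char v16qi __attribute__((vector_size (16)));

/* Even elements only: recognized as a truncation, i.e. vpmovdw.  */
v8hi
trunc_even (v16hi x)
{
  return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14);
}

/* Arbitrary one-operand permutation: expanded via vpermw.  */
v16hi
perm_one_operand (v16hi x)
{
  return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14,
				  8, 9, 10, 11, 12, 13, 14, 15);
}

/* Two-operand byte permutation: 128-bit vpermi2b with AVX512VL+AVX512VBMI.  */
v16qi
perm_two_operand (v16qi x, v16qi y)
{
  return __builtin_shufflevector (x, y,
				  0, 2, 4, 6, 8, 10, 12, 14,
				  16, 18, 20, 22, 24, 26, 28, 30);
}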

  Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
  Ok for trunk?

gcc/ChangeLog:

	PR target/101846
	* config/i386/i386-expand.c (ix86_expand_vec_perm_vpermt2):
	Support vpermi2b for V32QI/V16QImode.
	(ix86_extract_perm_from_pool_constant): New function.
	(ix86_expand_vec_one_operand_perm_avx512): Support
	vpermw/vpermb under TARGET_AVX512BW/TARGET_AVX512VBMI.
	(expand_vec_perm_1): Adjust comments for the above changes.
	* config/i386/i386-protos.h (ix86_extract_perm_from_pool_constant):
	New declaration.
	* config/i386/predicates.md (permvar_truncate_operand): New predicate.
	(pshufb_truncv4siv4hi_operand): Ditto.
	(pshufb_truncv8hiv8qi_operand): Ditto.
	* config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1):
	New pre_reload define_insn_and_split.
	(*avx512f_permvar_truncv8siv8hi_1): Ditto.
	(*avx512f_vpermvar_truncv8div8si_1): Ditto.
	(*avx512f_permvar_truncv32hiv32qi_1): Ditto.
	(*avx512f_permvar_truncv16hiv16qi_1): Ditto.
	(*avx512f_permvar_truncv4div4si_1): Ditto.
	(*avx512f_pshufb_truncv8hiv8qi_1): Ditto.
	(*avx512f_pshufb_truncv4siv4hi_1): Ditto.
	(*avx512f_pshufd_truncv2div2si_1): Ditto.

gcc/testsuite/ChangeLog:

	PR target/101846
	* gcc.target/i386/pr101846-2.c: New test.
	* gcc.target/i386/pr101846-3.c: New test.
	* gcc.target/i386/pr101846-4.c: New test.
---
 gcc/config/i386/i386-expand.c              |  89 +++++++++-
 gcc/config/i386/i386-protos.h              |   1 +
 gcc/config/i386/predicates.md              |  90 ++++++++++
 gcc/config/i386/sse.md                     | 190 +++++++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr101846-2.c |  81 +++++++++
 gcc/testsuite/gcc.target/i386/pr101846-3.c |  73 ++++++++
 gcc/testsuite/gcc.target/i386/pr101846-4.c |  40 +++++
 7 files changed, 559 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-4.c

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index a652b25f534..56319cb6f6a 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -4778,6 +4778,18 @@ ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
 
   switch (mode)
     {
+    case E_V16QImode:
+      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
+	gen = gen_avx512vl_vpermt2varv16qi3;
+      break;
+    case E_V32QImode:
+      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
+	gen = gen_avx512vl_vpermt2varv32qi3;
+      break;
+    case E_V64QImode:
+      if (TARGET_AVX512VBMI)
+	gen = gen_avx512bw_vpermt2varv64qi3;
+      break;
     case E_V8HImode:
       if (TARGET_AVX512VL && TARGET_AVX512BW)
 	gen = gen_avx512vl_vpermt2varv8hi3;
@@ -4786,10 +4798,6 @@ ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
       if (TARGET_AVX512VL && TARGET_AVX512BW)
 	gen = gen_avx512vl_vpermt2varv16hi3;
       break;
-    case E_V64QImode:
-      if (TARGET_AVX512VBMI)
-	gen = gen_avx512bw_vpermt2varv64qi3;
-      break;
     case E_V32HImode:
       if (TARGET_AVX512BW)
 	gen = gen_avx512bw_vpermt2varv32hi3;
@@ -5487,6 +5495,45 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
     }
 }
 
+/* Return true if MEM is a constant pool reference which contains a
+   CONST_VECTOR of permutation indices, and store the indices into PERM.  */
+bool
+ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
+{
+  machine_mode mode = GET_MODE (mem);
+  int nelt = GET_MODE_NUNITS (mode);
+
+  if (!INTEGRAL_MODE_P (mode))
+    return false;
+
+  /* Needs to be a constant pool reference.  */
+  if (!MEM_P (mem)
+      || !SYMBOL_REF_P (XEXP (mem, 0))
+      || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
+    return false;
+
+  rtx constant = get_pool_constant (XEXP (mem, 0));
+
+  if (GET_CODE (constant) != CONST_VECTOR)
+    return false;
+
+  /* There could be some rtx like
+     (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
+     but with "*.LC1" referring to a V2DI constant vector.  */
+  if (GET_MODE (constant) != mode)
+    {
+      constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
+
+      if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
+	return false;
+    }
+
+  for (int i = 0; i != nelt; i++)
+    perm[i] = UINTVAL (XVECEXP (constant, 0, i));
+
+  return true;
+}
+
 /* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
    but works for floating pointer parameters and nonoffsetable memories.
    For pushes, it returns just stack offsets; the values will be saved
@@ -18086,6 +18133,7 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
 {
   machine_mode mode = GET_MODE (d->op0);
   machine_mode maskmode = mode;
+  unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
   rtx (*gen) (rtx, rtx, rtx) = NULL;
   rtx target, op0, mask;
   rtx vec[64];
@@ -18096,6 +18144,18 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
   if (!TARGET_AVX512F)
     return false;
 
+  /* 128/256-bit vector modes require AVX512VL.  */
+  if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
+    return false;
+
+  /* vpermw.  */
+  if (!TARGET_AVX512BW && inner_size == 2)
+    return false;
+
+  /* vpermb.   */
+  if (!TARGET_AVX512VBMI && inner_size == 1)
+    return false;
+
   switch (mode)
     {
     case E_V16SImode:
@@ -18112,6 +18172,25 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
       gen = gen_avx512f_permvarv8df;
       maskmode = V8DImode;
       break;
+    case E_V32HImode:
+      gen = gen_avx512bw_permvarv32hi;
+      break;
+    case E_V16HImode:
+      gen = gen_avx512vl_permvarv16hi;
+      break;
+    case E_V8HImode:
+      gen = gen_avx512vl_permvarv8hi;
+      break;
+    case E_V64QImode:
+      gen = gen_avx512bw_permvarv64qi;
+      break;
+    case E_V32QImode:
+      gen = gen_avx512vl_permvarv32qi;
+      break;
+    case E_V16QImode:
+      gen = gen_avx512vl_permvarv16qi;
+      break;
+
     default:
       return false;
     }
@@ -18301,7 +18380,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
   if (expand_vec_perm_palignr (d, true))
     return true;
 
-  /* Try the AVX512F vperm{s,d} instructions.  */
+  /* Try the AVX512F vperm{w,b,s,d} instructions.  */
   if (ix86_expand_vec_one_operand_perm_avx512 (d))
     return true;
 
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 07ac02aff69..2fd13074c81 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -260,6 +260,7 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_abs (rtx, rtx);
 extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
 					       rtx);
+extern bool ix86_extract_perm_from_pool_constant (int*, rtx);
 
 /* In i386-c.c  */
 extern void ix86_target_macros (void);
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 129205ac3a7..650d6354de9 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1713,6 +1713,96 @@ (define_predicate "addsub_vs_parallel"
   return true;
 })
 
+;; Return true if OP is a constant pool reference for vperm{w,d,b} whose
+;; permutation indices match pmov{dw,wb,qd}.
+(define_predicate "permvar_truncate_operand"
+ (match_code "mem")
+{
+  int nelt = GET_MODE_NUNITS (mode);
+  int perm[128];
+  int id;
+
+  if (!INTEGRAL_MODE_P (mode) || !VECTOR_MODE_P (mode))
+    return false;
+
+  if (nelt < 2)
+    return false;
+
+  if (!ix86_extract_perm_from_pool_constant (&perm[0], op))
+    return false;
+
+  id = exact_log2 (nelt);
+
+  /* Check that the permutation is suitable for pmov{dw,wb,qd}.
+     For example, V16HImode to V8HImode:
+     { 0 2 4 6 8 10 12 14 * * * * * * * * }.  */
+  for (int i = 0; i != nelt/2; i++)
+    if ((perm[i] & ((1 << id) - 1)) != i * 2)
+      return false;
+
+  return true;
+})
+
+;; Return true if OP is a constant pool reference for pshufb whose
+;; byte indices match pmovdw.
+(define_predicate "pshufb_truncv4siv4hi_operand"
+ (match_code "mem")
+{
+  int perm[128];
+
+  if (mode != E_V16QImode)
+    return false;
+
+  if (!ix86_extract_perm_from_pool_constant (&perm[0], op))
+    return false;
+
+  /* Check that the permutation is suitable for pmovdw.
+     For example, V4SImode to V4HImode, as byte indices:
+     { 0 1 4 5 8 9 12 13 * * * * * * * * }.
+     index = i % 2 + (i / 2) * 4.  */
+  for (int i = 0; i != 8; i++)
+    {
+      /* if (SRC2[(i * 8)+7] = 1) then DEST[(i*8)+7..(i*8)+0] := 0;  */
+      if (perm[i] & 128)
+	return false;
+
+      if ((perm[i] & 15) != ((i & 1) + (i & 0xFE) * 2))
+	return false;
+     }
+
+  return true;
+})
+
+;; Return true if OP is a constant pool reference for pshufb whose
+;; byte indices match pmovwb.
+(define_predicate "pshufb_truncv8hiv8qi_operand"
+ (match_code "mem")
+{
+  int perm[128];
+
+  if (mode != E_V16QImode)
+    return false;
+
+  if (!ix86_extract_perm_from_pool_constant (&perm[0], op))
+    return false;
+
+  /* Check that the permutation is suitable for pmovwb.
+     For example, V8HImode to V8QImode, as byte indices:
+     { 0 2 4 6 8 10 12 14 * * * * * * * * }.
+     index = i * 2.  */
+  for (int i = 0; i != 8; i++)
+    {
+      /* if (SRC2[(i * 8)+7] = 1) then DEST[(i*8)+7..(i*8)+0] := 0;  */
+      if (perm[i] & 128)
+	return false;
+
+      if ((perm[i] & 15) != i * 2)
+	 return false;
+    }
+
+  return true;
+})
+
 ;; Return true if OP is a parallel for an pmovz{bw,wd,dq} vec_select,
 ;; where one of the two operands of the vec_concat is const0_operand.
 (define_predicate "pmovzx_parallel"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3a7bbaec7af..c9f21082beb 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -10978,6 +10978,64 @@ (define_insn "*avx512f_<code><pmov_src_lower><mode>2"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*avx512bw_permvar_truncv16siv16hi_1"
+  [(set (match_operand:V16HI 0 "nonimmediate_operand")
+	(vec_select:V16HI
+	  (unspec:V32HI
+	    [(match_operand:V32HI 1 "register_operand")
+	     (match_operand:V32HI 2 "permvar_truncate_operand")]
+	   UNSPEC_VPERMVAR)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)
+		     (const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)])))]
+  "TARGET_AVX512BW && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V16HI (match_dup 1)))]
+  "operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);")
+
+(define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1"
+  [(set (match_operand:V8HI 0 "nonimmediate_operand")
+	(vec_select:V8HI
+	  (unspec:V16HI
+	    [(match_operand:V16HI 1 "register_operand")
+	     (match_operand:V16HI 2 "permvar_truncate_operand")]
+	   UNSPEC_VPERMVAR)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)])))]
+  "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V8HI (match_dup 1)))]
+  "operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);")
+
+(define_insn_and_split "*avx512f_vpermvar_truncv8div8si_1"
+  [(set (match_operand:V8SI 0 "nonimmediate_operand")
+	(vec_select:V8SI
+	  (unspec:V16SI
+	    [(match_operand:V16SI 1 "register_operand")
+	     (match_operand:V16SI 2 "permvar_truncate_operand")]
+	   UNSPEC_VPERMVAR)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)])))]
+  "TARGET_AVX512F && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V8SI (match_dup 1)))]
+  "operands[1] = lowpart_subreg (V8DImode, operands[1], V16SImode);")
+
 (define_insn "avx512f_<code><pmov_src_lower><mode>2_mask"
   [(set (match_operand:PMOV_DST_MODE_1 0 "nonimmediate_operand" "=v,m")
     (vec_merge:PMOV_DST_MODE_1
@@ -11018,6 +11076,36 @@ (define_insn "avx512bw_<code>v32hiv32qi2"
    (set_attr "prefix" "evex")
    (set_attr "mode" "XI")])
 
+(define_insn_and_split "*avx512f_permvar_truncv32hiv32qi_1"
+  [(set (match_operand:V32QI 0 "nonimmediate_operand")
+	(vec_select:V32QI
+	  (unspec:V64QI
+	    [(match_operand:V64QI 1 "register_operand")
+	     (match_operand:V64QI 2 "permvar_truncate_operand")]
+	   UNSPEC_VPERMVAR)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)
+		     (const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)
+		     (const_int 16) (const_int 17)
+		     (const_int 18) (const_int 19)
+		     (const_int 20) (const_int 21)
+		     (const_int 22) (const_int 23)
+		     (const_int 24) (const_int 25)
+		     (const_int 26) (const_int 27)
+		     (const_int 28) (const_int 29)
+		     (const_int 30) (const_int 31)])))]
+  "TARGET_AVX512VBMI && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V32QI (match_dup 1)))]
+  "operands[1] = lowpart_subreg (V32HImode, operands[1], V64QImode);")
+
 (define_insn "avx512bw_<code>v32hiv32qi2_mask"
   [(set (match_operand:V32QI 0 "nonimmediate_operand" "=v,m")
     (vec_merge:V32QI
@@ -11063,6 +11151,45 @@ (define_insn "*avx512vl_<code><ssedoublemodelower><mode>2"
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn_and_split "*avx512f_permvar_truncv16hiv16qi_1"
+  [(set (match_operand:V16QI 0 "nonimmediate_operand")
+	(vec_select:V16QI
+	  (unspec:V32QI
+	    [(match_operand:V32QI 1 "register_operand")
+	     (match_operand:V32QI 2 "permvar_truncate_operand")]
+	   UNSPEC_VPERMVAR)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)
+		     (const_int 4) (const_int 5)
+		     (const_int 6) (const_int 7)
+		     (const_int 8) (const_int 9)
+		     (const_int 10) (const_int 11)
+		     (const_int 12) (const_int 13)
+		     (const_int 14) (const_int 15)])))]
+  "TARGET_AVX512VL && TARGET_AVX512VBMI
+   && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V16QI (match_dup 1)))]
+  "operands[1] = lowpart_subreg (V16HImode, operands[1], V32QImode);")
+
+(define_insn_and_split "*avx512f_permvar_truncv4div4si_1"
+  [(set (match_operand:V4SI 0 "nonimmediate_operand")
+	(vec_select:V4SI
+	  (unspec:V8SI
+	    [(match_operand:V8SI 1 "register_operand")
+	     (match_operand:V8SI 2 "permvar_truncate_operand")]
+	   UNSPEC_VPERMVAR)
+	  (parallel [(const_int 0) (const_int 1)
+		     (const_int 2) (const_int 3)])))]
+  "TARGET_AVX512VL && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(set (match_dup 0)
+	(truncate:V4SI (match_dup 1)))]
+  "operands[1] = lowpart_subreg (V4DImode, operands[1], V8SImode);")
+
 (define_insn "<avx512>_<code><ssedoublemodelower><mode>2_mask"
   [(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand" "=v,m")
     (vec_merge:PMOV_DST_MODE_2
@@ -11121,6 +11248,27 @@ (define_insn "avx512vl_<code><mode>v<ssescalarnum>qi2"
    (set_attr "prefix" "evex")
    (set_attr "mode" "TI")])
 
+(define_insn_and_split "*avx512f_pshufb_truncv8hiv8qi_1"
+  [(set (match_operand:DI 0 "register_operand")
+	(vec_select:DI
+	  (subreg:V2DI
+	    (unspec:V16QI
+	      [(match_operand:V16QI 1 "register_operand")
+	       (match_operand:V16QI 2 "pshufb_truncv8hiv8qi_operand")]
+	   UNSPEC_PSHUFB) 0)
+	  (parallel [(const_int 0)])))]
+  "TARGET_AVX512VL && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx op1 = gen_reg_rtx (V8QImode);
+  operands[1] = lowpart_subreg (V8HImode, operands[1], V16QImode);
+  emit_insn (gen_truncv8hiv8qi2 (op1, operands[1]));
+  emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V8QImode));
+  DONE;
+})
+
 (define_insn "*avx512vl_<code>v2div2qi2_store_1"
   [(set (match_operand:V2QI 0 "memory_operand" "=m")
 	(any_truncate:V2QI
@@ -11476,6 +11624,27 @@ (define_insn "avx512vl_<code><mode>v<ssescalarnum>hi2"
    (set_attr "prefix" "evex")
    (set_attr "mode" "TI")])
 
+(define_insn_and_split "*avx512f_pshufb_truncv4siv4hi_1"
+  [(set (match_operand:DI 0 "register_operand")
+	(vec_select:DI
+	  (subreg:V2DI
+	    (unspec:V16QI
+	      [(match_operand:V16QI 1 "register_operand")
+	       (match_operand:V16QI 2 "pshufb_truncv4siv4hi_operand")]
+	   UNSPEC_PSHUFB) 0)
+	  (parallel [(const_int 0)])))]
+  "TARGET_AVX512VL && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx op1 = gen_reg_rtx (V4HImode);
+  operands[1] = lowpart_subreg (V4SImode, operands[1], V16QImode);
+  emit_insn (gen_truncv4siv4hi2 (op1, operands[1]));
+  emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V4HImode));
+  DONE;
+})
+
 (define_insn "*avx512vl_<code><mode>v4hi2_store_1"
   [(set (match_operand:V4HI 0 "memory_operand" "=m")
 	(any_truncate:V4HI
@@ -11699,6 +11868,27 @@ (define_insn "avx512vl_<code>v2div2si2"
    (set_attr "prefix" "evex")
    (set_attr "mode" "TI")])
 
+(define_insn_and_split "*avx512f_pshufd_truncv2div2si_1"
+  [(set (match_operand:DI 0 "register_operand")
+	(vec_select:DI
+	  (subreg:V2DI
+	    (vec_select:V4SI
+	      (match_operand:V4SI 1 "register_operand")
+	      (parallel [(const_int 0) (const_int 2)
+			 (const_int 2) (const_int 3)])) 0)
+	  (parallel [(const_int 0)])))]
+  "TARGET_AVX512VL && ix86_pre_reload_split ()"
+  "#"
+  "&& 1"
+  [(const_int 0)]
+{
+  rtx op1 = gen_reg_rtx (V2SImode);
+  operands[1] = lowpart_subreg (V2DImode, operands[1], V4SImode);
+  emit_insn (gen_truncv2div2si2 (op1, operands[1]));
+  emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V2SImode));
+  DONE;
+})
+
 (define_insn "*avx512vl_<code>v2div2si2_store_1"
   [(set (match_operand:V2SI 0 "memory_operand" "=m")
 	(any_truncate:V2SI
diff --git a/gcc/testsuite/gcc.target/i386/pr101846-2.c b/gcc/testsuite/gcc.target/i386/pr101846-2.c
new file mode 100644
index 00000000000..26c9ed511e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101846-2.c
@@ -0,0 +1,81 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512vbmi -O2" } */
+/* { dg-final { scan-assembler-times "vpmovwb" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovdw" 3 } } */
+/* { dg-final { scan-assembler-times "vpmovqd" 3 } } */
+
+typedef short v4hi __attribute__((vector_size (8)));
+typedef short v8hi __attribute__((vector_size (16)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef char v8qi __attribute__((vector_size (8)));
+typedef char v16qi __attribute__((vector_size (16)));
+typedef char v32qi __attribute__((vector_size (32)));
+typedef char v64qi __attribute__((vector_size (64)));
+typedef int v2si __attribute__((vector_size (8)));
+typedef int v4si __attribute__((vector_size (16)));
+typedef int v8si __attribute__((vector_size (32)));
+typedef int v16si __attribute__((vector_size (64)));
+
+v16hi
+foo_dw_512 (v32hi x)
+{
+  return __builtin_shufflevector (x, x,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30);
+}
+
+v8hi
+foo_dw_256 (v16hi x)
+{
+  return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14);
+}
+
+v4hi
+foo_dw_128 (v8hi x)
+{
+  return __builtin_shufflevector (x, x, 0, 2, 4, 6);
+}
+
+v8si
+foo_qd_512 (v16si x)
+{
+  return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14);
+}
+
+v4si
+foo_qd_256 (v8si x)
+{
+  return __builtin_shufflevector (x, x, 0, 2, 4, 6);
+}
+
+v2si
+foo_qd_128 (v4si x)
+{
+  return __builtin_shufflevector (x, x, 0, 2);
+}
+
+v32qi
+foo_wb_512 (v64qi x)
+{
+  return __builtin_shufflevector (x, x,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30,
+				  32, 34, 36, 38, 40, 42, 44, 46,
+				  48, 50, 52, 54, 56, 58, 60, 62);
+}
+
+v16qi
+foo_wb_256 (v32qi x)
+{
+  return __builtin_shufflevector (x, x,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30);
+}
+
+v8qi
+foo_wb_128 (v16qi x)
+{
+  return __builtin_shufflevector (x, x,
+				  0, 2, 4, 6, 8, 10, 12, 14);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101846-3.c b/gcc/testsuite/gcc.target/i386/pr101846-3.c
new file mode 100644
index 00000000000..f774018a382
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101846-3.c
@@ -0,0 +1,73 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512vbmi -O2" } */
+/* { dg-final { scan-assembler-times "vpermb" 2 } } */
+/* { dg-final { scan-assembler-times "vpermw" 2 } } */
+/* { dg-final { scan-assembler-times "vpermd" 2 } } */
+
+typedef short v4hi __attribute__((vector_size (8)));
+typedef short v8hi __attribute__((vector_size (16)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef char v8qi __attribute__((vector_size (8)));
+typedef char v16qi __attribute__((vector_size (16)));
+typedef char v32qi __attribute__((vector_size (32)));
+typedef char v64qi __attribute__((vector_size (64)));
+typedef int v2si __attribute__((vector_size (8)));
+typedef int v4si __attribute__((vector_size (16)));
+typedef int v8si __attribute__((vector_size (32)));
+typedef int v16si __attribute__((vector_size (64)));
+
+v32hi
+foow_512 (v32hi x)
+{
+  return __builtin_shufflevector (x, x,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30,
+				  16, 17, 18, 19, 20, 21, 22, 23,
+				  24, 25, 26, 27, 28, 29, 30, 31);
+}
+
+v16hi
+foow_256 (v16hi x)
+{
+  return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14,
+				  8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+
+v16si
+food_512 (v16si x)
+{
+  return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14,
+				  8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+v8si
+food_256 (v8si x)
+{
+  return __builtin_shufflevector (x, x, 0, 2, 4, 6, 4, 5, 6, 7);
+}
+
+v64qi
+foob_512 (v64qi x)
+{
+  return __builtin_shufflevector (x, x,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30,
+				  32, 34, 36, 38, 40, 42, 44, 46,
+				  48, 50, 52, 54, 56, 58, 60, 62,
+				  32, 33, 34, 35, 36, 37, 38, 39,
+				  40, 41, 42, 43, 44, 45, 46, 47,
+				  48, 49, 50, 51, 52, 53, 54, 55,
+				  56, 57, 58, 59, 60, 61, 62, 63);
+}
+
+v32qi
+foob_256 (v32qi x)
+{
+  return __builtin_shufflevector (x, x,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30,
+				  16, 17, 18, 19, 20, 21, 22, 23,
+				  24, 25, 26, 27, 28, 29, 30, 31);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101846-4.c b/gcc/testsuite/gcc.target/i386/pr101846-4.c
new file mode 100644
index 00000000000..2a6163c4d72
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101846-4.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512vbmi -O2" } */
+/* { dg-final { scan-assembler-times "vpermi2b" 3 } } */
+
+typedef char v16qi __attribute__((vector_size (16)));
+typedef char v32qi __attribute__((vector_size (32)));
+typedef char v64qi __attribute__((vector_size (64)));
+
+
+v64qi
+foob_512 (v64qi x, v64qi y)
+{
+  return __builtin_shufflevector (x, y,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30,
+				  32, 34, 36, 38, 40, 42, 44, 46,
+				  48, 50, 52, 54, 56, 58, 60, 62,
+				  64, 65, 66, 67, 68, 69, 70, 71,
+				  72, 73, 74, 77, 79, 74, 72, 70,
+				  89, 88, 78, 86, 85, 75, 83, 82,
+				  112, 108, 101, 100, 86, 96, 97, 95);
+}
+
+v32qi
+foob_256 (v32qi x, v32qi y)
+{
+  return __builtin_shufflevector (x, y,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30,
+				  32, 34, 36, 38, 40, 42, 44, 46,
+				  48, 50, 52, 54, 56, 58, 60, 62);
+}
+
+v16qi
+foob_128 (v16qi x, v16qi y)
+{
+  return __builtin_shufflevector (x, y,
+				  0, 2, 4, 6, 8, 10, 12, 14,
+				  16, 18, 20, 22, 24, 26, 28, 30);
+}
-- 
2.27.0



Thread overview: 14+ messages
2021-08-11  6:43 [PATCH] [i386] Combine avx_vec_concatv16si and avx512f_zero_extendv16hiv16si2_1 to avx512f_zero_extendv16hiv16si2_2 liuhongt
2021-08-11  7:58 ` Jakub Jelinek
2021-08-11  9:32   ` Hongtao Liu
2021-08-12  5:43     ` [PATCH] [i386] Optimize vec_perm_expr to match vpmov{dw,qd,wb} liuhongt
2021-08-12  9:22       ` Jakub Jelinek
2021-08-12  9:41         ` Jakub Jelinek
2021-08-13  1:42         ` Hongtao Liu
2021-08-13  8:47           ` Jakub Jelinek
2021-08-13  9:03             ` Richard Sandiford
2021-08-16  7:19               ` Richard Biener
2021-08-16  5:18             ` liuhongt [this message]
2021-08-16  7:10               ` [PATCH] [i386] Optimize __builtin_shuffle_vector Jakub Jelinek
2021-08-16  7:25                 ` Hongtao Liu
2021-08-17  1:45                   ` Hongtao Liu
