From: liuhongt <hongtao.liu@intel.com>
To: gcc-patches@gcc.gnu.org
Cc: jakub@rehat.com, crazylht@gmail.com
Subject: [PATCH] [i386] Optimize __builtin_shuffle_vector.
Date: Mon, 16 Aug 2021 13:18:38 +0800 [thread overview]
Message-ID: <20210816051838.868413-1-hongtao.liu@intel.com> (raw)
In-Reply-To: <20210813084753.GG2380545@tucnak>
Hi:
Here's the updated patch, which does 3 things:
1. Support vpermw/vpermb in ix86_expand_vec_one_operand_perm_avx512.
2. Support 256/128-bits vpermi2b in ix86_expand_vec_perm_vpermt2.
3. Add define_insn_and_split to optimize specific vector permutations to pmov{dw,wb,qd}.
Bootstrapped and regtested on x86_64-linux-gnu{-m32,}.
Ok for trunk?
gcc/ChangeLog:
PR target/101846
* config/i386/i386-expand.c (ix86_expand_vec_perm_vpermt2):
Support vpermi2b for V32QI/V16QImode.
(ix86_extract_perm_from_pool_constant): New function.
(ix86_expand_vec_one_operand_perm_avx512): Support
vpermw/vpermb under TARGET_AVX512BW/TARGET_AVX512VBMI.
(expand_vec_perm_1): Adjust comments for upper.
* config/i386/i386-protos.h (ix86_extract_perm_from_pool_constant):
New declaration.
* config/i386/predicates.md (permvar_truncate_operand): New predicate.
(pshufb_truncv4siv4hi_operand): Ditto.
(pshufb_truncv8hiv8qi_operand): Ditto.
* config/i386/sse.md (*avx512bw_permvar_truncv16siv16hi_1):
New pre_reload define_insn_and_split.
(*avx512f_permvar_truncv8siv8hi_1): Ditto.
(*avx512f_vpermvar_truncv8div8si_1): Ditto.
(*avx512f_permvar_truncv32hiv32qi_1): Ditto.
(*avx512f_permvar_truncv16hiv16qi_1): Ditto.
(*avx512f_permvar_truncv4div4si_1): Ditto.
(*avx512f_pshufb_truncv8hiv8qi_1): Ditto.
(*avx512f_pshufb_truncv4siv4hi_1): Ditto.
(*avx512f_pshufd_truncv2div2si_1): Ditto.
gcc/testsuite/ChangeLog:
PR target/101846
* gcc.target/i386/pr101846-2.c: New test.
* gcc.target/i386/pr101846-3.c: New test.
* gcc.target/i386/pr101846-4.c: New test.
---
gcc/config/i386/i386-expand.c | 89 +++++++++-
gcc/config/i386/i386-protos.h | 1 +
gcc/config/i386/predicates.md | 90 ++++++++++
gcc/config/i386/sse.md | 190 +++++++++++++++++++++
gcc/testsuite/gcc.target/i386/pr101846-2.c | 81 +++++++++
gcc/testsuite/gcc.target/i386/pr101846-3.c | 73 ++++++++
gcc/testsuite/gcc.target/i386/pr101846-4.c | 40 +++++
7 files changed, 559 insertions(+), 5 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-2.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-3.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr101846-4.c
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index a652b25f534..56319cb6f6a 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -4778,6 +4778,18 @@ ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
switch (mode)
{
+ case E_V16QImode:
+ if (TARGET_AVX512VL && TARGET_AVX512VBMI)
+ gen = gen_avx512vl_vpermt2varv16qi3;
+ break;
+ case E_V32QImode:
+ if (TARGET_AVX512VL && TARGET_AVX512VBMI)
+ gen = gen_avx512vl_vpermt2varv32qi3;
+ break;
+ case E_V64QImode:
+ if (TARGET_AVX512VBMI)
+ gen = gen_avx512bw_vpermt2varv64qi3;
+ break;
case E_V8HImode:
if (TARGET_AVX512VL && TARGET_AVX512BW)
gen = gen_avx512vl_vpermt2varv8hi3;
@@ -4786,10 +4798,6 @@ ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
if (TARGET_AVX512VL && TARGET_AVX512BW)
gen = gen_avx512vl_vpermt2varv16hi3;
break;
- case E_V64QImode:
- if (TARGET_AVX512VBMI)
- gen = gen_avx512bw_vpermt2varv64qi3;
- break;
case E_V32HImode:
if (TARGET_AVX512BW)
gen = gen_avx512bw_vpermt2varv32hi3;
@@ -5487,6 +5495,45 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
}
}
+/* Return true if MEM is a constant pool reference holding a const_vector
+ of permutation indices, and store those indices in PERM. */
+bool
+ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
+{
+ machine_mode mode = GET_MODE (mem);
+ int nelt = GET_MODE_NUNITS (mode);
+
+ if (!INTEGRAL_MODE_P (mode))
+ return false;
+
+ /* MEM must be a constant pool reference. */
+ if (!(MEM_P (mem))
+ || !SYMBOL_REF_P (XEXP (mem, 0))
+ || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
+ return false;
+
+ rtx constant = get_pool_constant (XEXP (mem, 0));
+
+ if (GET_CODE (constant) != CONST_VECTOR)
+ return false;
+
+ /* There could be some rtx like
+ (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
+ where "*.LC1" is a V2DI constant vector; view it in MODE instead. */
+ if (GET_MODE (constant) != mode)
+ {
+ constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
+
+ if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
+ return false;
+ }
+
+ for (int i = 0; i != nelt; i++)
+ perm[i] = UINTVAL (XVECEXP (constant, 0, i));
+
+ return true;
+}
+
/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
but works for floating pointer parameters and nonoffsetable memories.
For pushes, it returns just stack offsets; the values will be saved
@@ -18086,6 +18133,7 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
{
machine_mode mode = GET_MODE (d->op0);
machine_mode maskmode = mode;
+ unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
rtx (*gen) (rtx, rtx, rtx) = NULL;
rtx target, op0, mask;
rtx vec[64];
@@ -18096,6 +18144,18 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
if (!TARGET_AVX512F)
return false;
+ /* Accept VNxHImode and VNxQImode now. */
+ if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
+ return false;
+
+ /* vpermw. */
+ if (!TARGET_AVX512BW && inner_size == 2)
+ return false;
+
+ /* vpermb. */
+ if (!TARGET_AVX512VBMI && inner_size == 1)
+ return false;
+
switch (mode)
{
case E_V16SImode:
@@ -18112,6 +18172,25 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
gen = gen_avx512f_permvarv8df;
maskmode = V8DImode;
break;
+ case E_V32HImode:
+ gen = gen_avx512bw_permvarv32hi;
+ break;
+ case E_V16HImode:
+ gen = gen_avx512vl_permvarv16hi;
+ break;
+ case E_V8HImode:
+ gen = gen_avx512vl_permvarv8hi;
+ break;
+ case E_V64QImode:
+ gen = gen_avx512bw_permvarv64qi;
+ break;
+ case E_V32QImode:
+ gen = gen_avx512vl_permvarv32qi;
+ break;
+ case E_V16QImode:
+ gen = gen_avx512vl_permvarv16qi;
+ break;
+
default:
return false;
}
@@ -18301,7 +18380,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_palignr (d, true))
return true;
- /* Try the AVX512F vperm{s,d} instructions. */
+ /* Try the AVX512F vperm{w,b,s,d} instructions. */
if (ix86_expand_vec_one_operand_perm_avx512 (d))
return true;
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 07ac02aff69..2fd13074c81 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -260,6 +260,7 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
extern void ix86_expand_sse2_abs (rtx, rtx);
extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
rtx);
+extern bool ix86_extract_perm_from_pool_constant (int*, rtx);
/* In i386-c.c */
extern void ix86_target_macros (void);
diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md
index 129205ac3a7..650d6354de9 100644
--- a/gcc/config/i386/predicates.md
+++ b/gcc/config/i386/predicates.md
@@ -1713,6 +1713,96 @@ (define_predicate "addsub_vs_parallel"
return true;
})
+;; Return true if OP is a constant pool operand of vperm{w,d,b} whose indices
+;; match pmov{dw,wb,qd}.
+(define_predicate "permvar_truncate_operand"
+ (match_code "mem")
+{
+ int nelt = GET_MODE_NUNITS (mode);
+ int perm[128];
+ int id;
+
+ if (!INTEGRAL_MODE_P (mode) || !VECTOR_MODE_P (mode))
+ return false;
+
+ if (nelt < 2)
+ return false;
+
+ if (!ix86_extract_perm_from_pool_constant (&perm[0], op))
+ return false;
+
+ id = exact_log2 (nelt);
+
+ /* Check that the permutation is suitable for pmov{wb,dw,qd}: the low
+ half must select the even elements, e.g. for V16HImode
+ { 0 2 4 6 8 10 12 14 * * * * * * * * }. */
+ for (int i = 0; i != nelt/2; i++)
+ if ((perm[i] & ((1 << id) - 1)) != i * 2)
+ return false;
+
+ return true;
+})
+
+;; Return true if OP is a constant pool operand of pshufb whose indices
+;; match pmovdw.
+(define_predicate "pshufb_truncv4siv4hi_operand"
+ (match_code "mem")
+{
+ int perm[128];
+
+ if (mode != E_V16QImode)
+ return false;
+
+ if (!ix86_extract_perm_from_pool_constant (&perm[0], op))
+ return false;
+
+ /* Check that the permutation is suitable for pmovdw.
+ For example V4SImode to V4HImode
+ { 0 1 4 5 8 9 12 13 * * * * * * * * }.
+ index = i % 2 + (i / 2) * 4. */
+ for (int i = 0; i != 8; i++)
+ {
+ /* if (SRC2[(i * 8)+7] = 1) then DEST[(i*8)+7..(i*8)+0] := 0; */
+ if (perm[i] & 128)
+ return false;
+
+ if ((perm[i] & 15) != ((i & 1) + (i & 0xFE) * 2))
+ return false;
+ }
+
+ return true;
+})
+
+;; Return true if OP is a constant pool operand of pshufb whose indices
+;; match pmovwb.
+(define_predicate "pshufb_truncv8hiv8qi_operand"
+ (match_code "mem")
+{
+ int perm[128];
+
+ if (mode != E_V16QImode)
+ return false;
+
+ if (!ix86_extract_perm_from_pool_constant (&perm[0], op))
+ return false;
+
+ /* Check that the permutation is suitable for pmovwb.
+ For example V8HImode to V8QImode
+ { 0 2 4 6 8 10 12 14 * * * * * * * * }.
+ index = i * 2. */
+ for (int i = 0; i != 8; i++)
+ {
+ /* if (SRC2[(i * 8)+7] = 1) then DEST[(i*8)+7..(i*8)+0] := 0; */
+ if (perm[i] & 128)
+ return false;
+
+ if ((perm[i] & 15) != i * 2)
+ return false;
+ }
+
+ return true;
+})
+
;; Return true if OP is a parallel for an pmovz{bw,wd,dq} vec_select,
;; where one of the two operands of the vec_concat is const0_operand.
(define_predicate "pmovzx_parallel"
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 3a7bbaec7af..c9f21082beb 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -10978,6 +10978,64 @@ (define_insn "*avx512f_<code><pmov_src_lower><mode>2"
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn_and_split "*avx512bw_permvar_truncv16siv16hi_1"
+ [(set (match_operand:V16HI 0 "nonimmediate_operand")
+ (vec_select:V16HI
+ (unspec:V32HI
+ [(match_operand:V32HI 1 "register_operand")
+ (match_operand:V32HI 2 "permvar_truncate_operand")]
+ UNSPEC_VPERMVAR)
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)
+ (const_int 4) (const_int 5)
+ (const_int 6) (const_int 7)
+ (const_int 8) (const_int 9)
+ (const_int 10) (const_int 11)
+ (const_int 12) (const_int 13)
+ (const_int 14) (const_int 15)])))]
+ "TARGET_AVX512BW && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (truncate:V16HI (match_dup 1)))]
+ "operands[1] = lowpart_subreg (V16SImode, operands[1], V32HImode);")
+
+(define_insn_and_split "*avx512f_permvar_truncv8siv8hi_1"
+ [(set (match_operand:V8HI 0 "nonimmediate_operand")
+ (vec_select:V8HI
+ (unspec:V16HI
+ [(match_operand:V16HI 1 "register_operand")
+ (match_operand:V16HI 2 "permvar_truncate_operand")]
+ UNSPEC_VPERMVAR)
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)
+ (const_int 4) (const_int 5)
+ (const_int 6) (const_int 7)])))]
+ "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (truncate:V8HI (match_dup 1)))]
+ "operands[1] = lowpart_subreg (V8SImode, operands[1], V16HImode);")
+
+(define_insn_and_split "*avx512f_vpermvar_truncv8div8si_1"
+ [(set (match_operand:V8SI 0 "nonimmediate_operand")
+ (vec_select:V8SI
+ (unspec:V16SI
+ [(match_operand:V16SI 1 "register_operand")
+ (match_operand:V16SI 2 "permvar_truncate_operand")]
+ UNSPEC_VPERMVAR)
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)
+ (const_int 4) (const_int 5)
+ (const_int 6) (const_int 7)])))]
+ "TARGET_AVX512F && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (truncate:V8SI (match_dup 1)))]
+ "operands[1] = lowpart_subreg (V8DImode, operands[1], V16SImode);")
+
(define_insn "avx512f_<code><pmov_src_lower><mode>2_mask"
[(set (match_operand:PMOV_DST_MODE_1 0 "nonimmediate_operand" "=v,m")
(vec_merge:PMOV_DST_MODE_1
@@ -11018,6 +11076,36 @@ (define_insn "avx512bw_<code>v32hiv32qi2"
(set_attr "prefix" "evex")
(set_attr "mode" "XI")])
+(define_insn_and_split "*avx512f_permvar_truncv32hiv32qi_1"
+ [(set (match_operand:V32QI 0 "nonimmediate_operand")
+ (vec_select:V32QI
+ (unspec:V64QI
+ [(match_operand:V64QI 1 "register_operand")
+ (match_operand:V64QI 2 "permvar_truncate_operand")]
+ UNSPEC_VPERMVAR)
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)
+ (const_int 4) (const_int 5)
+ (const_int 6) (const_int 7)
+ (const_int 8) (const_int 9)
+ (const_int 10) (const_int 11)
+ (const_int 12) (const_int 13)
+ (const_int 14) (const_int 15)
+ (const_int 16) (const_int 17)
+ (const_int 18) (const_int 19)
+ (const_int 20) (const_int 21)
+ (const_int 22) (const_int 23)
+ (const_int 24) (const_int 25)
+ (const_int 26) (const_int 27)
+ (const_int 28) (const_int 29)
+ (const_int 30) (const_int 31)])))]
+ "TARGET_AVX512VBMI && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (truncate:V32QI (match_dup 1)))]
+ "operands[1] = lowpart_subreg (V32HImode, operands[1], V64QImode);")
+
(define_insn "avx512bw_<code>v32hiv32qi2_mask"
[(set (match_operand:V32QI 0 "nonimmediate_operand" "=v,m")
(vec_merge:V32QI
@@ -11063,6 +11151,45 @@ (define_insn "*avx512vl_<code><ssedoublemodelower><mode>2"
(set_attr "prefix" "evex")
(set_attr "mode" "<sseinsnmode>")])
+(define_insn_and_split "*avx512f_permvar_truncv16hiv16qi_1"
+ [(set (match_operand:V16QI 0 "nonimmediate_operand")
+ (vec_select:V16QI
+ (unspec:V32QI
+ [(match_operand:V32QI 1 "register_operand")
+ (match_operand:V32QI 2 "permvar_truncate_operand")]
+ UNSPEC_VPERMVAR)
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)
+ (const_int 4) (const_int 5)
+ (const_int 6) (const_int 7)
+ (const_int 8) (const_int 9)
+ (const_int 10) (const_int 11)
+ (const_int 12) (const_int 13)
+ (const_int 14) (const_int 15)])))]
+ "TARGET_AVX512VL && TARGET_AVX512VBMI
+ && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (truncate:V16QI (match_dup 1)))]
+ "operands[1] = lowpart_subreg (V16HImode, operands[1], V32QImode);")
+
+(define_insn_and_split "*avx512f_permvar_truncv4div4si_1"
+ [(set (match_operand:V4SI 0 "nonimmediate_operand")
+ (vec_select:V4SI
+ (unspec:V8SI
+ [(match_operand:V8SI 1 "register_operand")
+ (match_operand:V8SI 2 "permvar_truncate_operand")]
+ UNSPEC_VPERMVAR)
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)])))]
+ "TARGET_AVX512VL && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(set (match_dup 0)
+ (truncate:V4SI (match_dup 1)))]
+ "operands[1] = lowpart_subreg (V4DImode, operands[1], V8SImode);")
+
(define_insn "<avx512>_<code><ssedoublemodelower><mode>2_mask"
[(set (match_operand:PMOV_DST_MODE_2 0 "nonimmediate_operand" "=v,m")
(vec_merge:PMOV_DST_MODE_2
@@ -11121,6 +11248,27 @@ (define_insn "avx512vl_<code><mode>v<ssescalarnum>qi2"
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
+(define_insn_and_split "*avx512f_pshufb_truncv8hiv8qi_1"
+ [(set (match_operand:DI 0 "register_operand")
+ (vec_select:DI
+ (subreg:V2DI
+ (unspec:V16QI
+ [(match_operand:V16QI 1 "register_operand")
+ (match_operand:V16QI 2 "pshufb_truncv8hiv8qi_operand")]
+ UNSPEC_PSHUFB) 0)
+ (parallel [(const_int 0)])))]
+ "TARGET_AVX512VL && TARGET_AVX512BW && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ rtx op1 = gen_reg_rtx (V8QImode);
+ operands[1] = lowpart_subreg (V8HImode, operands[1], V16QImode);
+ emit_insn (gen_truncv8hiv8qi2 (op1, operands[1]));
+ emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V8QImode));
+ DONE;
+})
+
(define_insn "*avx512vl_<code>v2div2qi2_store_1"
[(set (match_operand:V2QI 0 "memory_operand" "=m")
(any_truncate:V2QI
@@ -11476,6 +11624,27 @@ (define_insn "avx512vl_<code><mode>v<ssescalarnum>hi2"
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
+(define_insn_and_split "*avx512f_pshufb_truncv4siv4hi_1"
+ [(set (match_operand:DI 0 "register_operand")
+ (vec_select:DI
+ (subreg:V2DI
+ (unspec:V16QI
+ [(match_operand:V16QI 1 "register_operand")
+ (match_operand:V16QI 2 "pshufb_truncv4siv4hi_operand")]
+ UNSPEC_PSHUFB) 0)
+ (parallel [(const_int 0)])))]
+ "TARGET_AVX512VL && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ rtx op1 = gen_reg_rtx (V4HImode);
+ operands[1] = lowpart_subreg (V4SImode, operands[1], V16QImode);
+ emit_insn (gen_truncv4siv4hi2 (op1, operands[1]));
+ emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V4HImode));
+ DONE;
+})
+
(define_insn "*avx512vl_<code><mode>v4hi2_store_1"
[(set (match_operand:V4HI 0 "memory_operand" "=m")
(any_truncate:V4HI
@@ -11699,6 +11868,27 @@ (define_insn "avx512vl_<code>v2div2si2"
(set_attr "prefix" "evex")
(set_attr "mode" "TI")])
+(define_insn_and_split "*avx512f_pshufd_truncv2div2si_1"
+ [(set (match_operand:DI 0 "register_operand")
+ (vec_select:DI
+ (subreg:V2DI
+ (vec_select:V4SI
+ (match_operand:V4SI 1 "register_operand")
+ (parallel [(const_int 0) (const_int 2)
+ (const_int 2) (const_int 3)])) 0)
+ (parallel [(const_int 0)])))]
+ "TARGET_AVX512VL && ix86_pre_reload_split ()"
+ "#"
+ "&& 1"
+ [(const_int 0)]
+{
+ rtx op1 = gen_reg_rtx (V2SImode);
+ operands[1] = lowpart_subreg (V2DImode, operands[1], V4SImode);
+ emit_insn (gen_truncv2div2si2 (op1, operands[1]));
+ emit_move_insn (operands[0], lowpart_subreg (DImode, op1, V2SImode));
+ DONE;
+})
+
(define_insn "*avx512vl_<code>v2div2si2_store_1"
[(set (match_operand:V2SI 0 "memory_operand" "=m")
(any_truncate:V2SI
diff --git a/gcc/testsuite/gcc.target/i386/pr101846-2.c b/gcc/testsuite/gcc.target/i386/pr101846-2.c
new file mode 100644
index 00000000000..26c9ed511e5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101846-2.c
@@ -0,0 +1,81 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512vbmi -O2" } */
+/* { dg-final { scan-assembler-times "vpmovwb" "3" } } */
+/* { dg-final { scan-assembler-times "vpmovdw" "3" } } */
+/* { dg-final { scan-assembler-times "vpmovqd" "3" } } */
+
+typedef short v4hi __attribute__((vector_size (8)));
+typedef short v8hi __attribute__((vector_size (16)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef char v8qi __attribute__((vector_size (8)));
+typedef char v16qi __attribute__((vector_size (16)));
+typedef char v32qi __attribute__((vector_size (32)));
+typedef char v64qi __attribute__((vector_size (64)));
+typedef int v2si __attribute__((vector_size (8)));
+typedef int v4si __attribute__((vector_size (16)));
+typedef int v8si __attribute__((vector_size (32)));
+typedef int v16si __attribute__((vector_size (64)));
+
+v16hi
+foo_dw_512 (v32hi x)
+{
+ return __builtin_shufflevector (x, x,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30);
+}
+
+v8hi
+foo_dw_256 (v16hi x)
+{
+ return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14);
+}
+
+v4hi
+foo_dw_128 (v8hi x)
+{
+ return __builtin_shufflevector (x, x, 0, 2, 4, 6);
+}
+
+v8si
+foo_qd_512 (v16si x)
+{
+ return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14);
+}
+
+v4si
+foo_qd_256 (v8si x)
+{
+ return __builtin_shufflevector (x, x, 0, 2, 4, 6);
+}
+
+v2si
+foo_qd_128 (v4si x)
+{
+ return __builtin_shufflevector (x, x, 0, 2);
+}
+
+v32qi
+foo_wb_512 (v64qi x)
+{
+ return __builtin_shufflevector (x, x,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62);
+}
+
+v16qi
+foo_wb_256 (v32qi x)
+{
+ return __builtin_shufflevector (x, x,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30);
+}
+
+v8qi
+foo_wb_128 (v16qi x)
+{
+ return __builtin_shufflevector (x, x,
+ 0, 2, 4, 6, 8, 10, 12, 14);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101846-3.c b/gcc/testsuite/gcc.target/i386/pr101846-3.c
new file mode 100644
index 00000000000..f774018a382
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101846-3.c
@@ -0,0 +1,73 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512vbmi -O2" } */
+/* { dg-final { scan-assembler-times "vpermb" "2" } } */
+/* { dg-final { scan-assembler-times "vpermw" "2" } } */
+/* { dg-final { scan-assembler-times "vpermd" "2" } } */
+
+typedef short v4hi __attribute__((vector_size (8)));
+typedef short v8hi __attribute__((vector_size (16)));
+typedef short v16hi __attribute__((vector_size (32)));
+typedef short v32hi __attribute__((vector_size (64)));
+typedef char v8qi __attribute__((vector_size (8)));
+typedef char v16qi __attribute__((vector_size (16)));
+typedef char v32qi __attribute__((vector_size (32)));
+typedef char v64qi __attribute__((vector_size (64)));
+typedef int v2si __attribute__((vector_size (8)));
+typedef int v4si __attribute__((vector_size (16)));
+typedef int v8si __attribute__((vector_size (32)));
+typedef int v16si __attribute__((vector_size (64)));
+
+v32hi
+foow_512 (v32hi x)
+{
+ return __builtin_shufflevector (x, x,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31);
+}
+
+v16hi
+foow_256 (v16hi x)
+{
+ return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14,
+ 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+
+v16si
+food_512 (v16si x)
+{
+ return __builtin_shufflevector (x, x, 0, 2, 4, 6, 8, 10, 12, 14,
+ 8, 9, 10, 11, 12, 13, 14, 15);
+}
+
+v8si
+food_256 (v8si x)
+{
+ return __builtin_shufflevector (x, x, 0, 2, 4, 6, 4, 5, 6, 7);
+}
+
+v64qi
+foob_512 (v64qi x)
+{
+ return __builtin_shufflevector (x, x,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63);
+}
+
+v32qi
+foob_256 (v32qi x)
+{
+ return __builtin_shufflevector (x, x,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31);
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101846-4.c b/gcc/testsuite/gcc.target/i386/pr101846-4.c
new file mode 100644
index 00000000000..2a6163c4d72
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101846-4.c
@@ -0,0 +1,40 @@
+/* { dg-do compile } */
+/* { dg-options "-mavx512vl -mavx512vbmi -O2" } */
+/* { dg-final { scan-assembler-times "vpermi2b" "3" } } */
+
+typedef char v16qi __attribute__((vector_size (16)));
+typedef char v32qi __attribute__((vector_size (32)));
+typedef char v64qi __attribute__((vector_size (64)));
+
+
+v64qi
+foob_512 (v64qi x, v64qi y)
+{
+ return __builtin_shufflevector (x, y,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62,
+ 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 77, 79, 74, 72, 70,
+ 89, 88, 78, 86, 85, 75, 83, 82,
+ 112, 108, 101, 100, 86, 96, 97, 95);
+}
+
+v32qi
+foob_256 (v32qi x, v32qi y)
+{
+ return __builtin_shufflevector (x, y,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30,
+ 32, 34, 36, 38, 40, 42, 44, 46,
+ 48, 50, 52, 54, 56, 58, 60, 62);
+}
+
+v16qi
+foob_128 (v16qi x, v16qi y)
+{
+ return __builtin_shufflevector (x, y,
+ 0, 2, 4, 6, 8, 10, 12, 14,
+ 16, 18, 20, 22, 24, 26, 28, 30);
+}
--
2.27.0
next prev parent reply other threads:[~2021-08-16 5:18 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-08-11 6:43 [PATCH] [i386] Combine avx_vec_concatv16si and avx512f_zero_extendv16hiv16si2_1 to avx512f_zero_extendv16hiv16si2_2 liuhongt
2021-08-11 7:58 ` Jakub Jelinek
2021-08-11 9:32 ` Hongtao Liu
2021-08-12 5:43 ` [PATCH] [i386] Optimize vec_perm_expr to match vpmov{dw,qd,wb} liuhongt
2021-08-12 9:22 ` Jakub Jelinek
2021-08-12 9:41 ` Jakub Jelinek
2021-08-13 1:42 ` Hongtao Liu
2021-08-13 8:47 ` Jakub Jelinek
2021-08-13 9:03 ` Richard Sandiford
2021-08-16 7:19 ` Richard Biener
2021-08-16 5:18 ` liuhongt [this message]
2021-08-16 7:10 ` [PATCH] [i386] Optimize __builtin_shuffle_vector Jakub Jelinek
2021-08-16 7:25 ` Hongtao Liu
2021-08-17 1:45 ` Hongtao Liu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20210816051838.868413-1-hongtao.liu@intel.com \
--to=hongtao.liu@intel.com \
--cc=crazylht@gmail.com \
--cc=gcc-patches@gcc.gnu.org \
--cc=jakub@rehat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).