* [PATCH i386 AVX512] [63/n] Add vpshufb, perm autogen.
@ 2014-10-06 12:55 Kirill Yukhin
2014-10-06 14:10 ` Jakub Jelinek
0 siblings, 1 reply; 11+ messages in thread
From: Kirill Yukhin @ 2014-10-06 12:55 UTC (permalink / raw)
To: Uros Bizjak; +Cc: Jakub Jelinek, Richard Henderson, GCC Patches, kirill.yukhin
Hello,
This patch extends permutations for AVX-512*.
Comments are welcome!
Bootstrapped.
AVX-512* tests on top of patch-set all pass
under simulator.
Is it ok for trunk?
gcc/
* config/i386/i386.c
(ix86_expand_vec_perm_vpermi2): Handle V64QImode, V8HImode, V16HImode,
V32HImode, V4SImode, V8SImode, V4SFmode, V8SFmode, V2DImode, V4DImode,
V2DFmode, V4DFmode.
(ix86_expand_sse_unpack): Handle V64QImode.
(expand_vec_perm_blend): Update conditions for TARGET, handle
V8DFmode, V16SFmode, V32HImode, V64QImode, V16SImode, V8DImode.
(expand_vec_perm_pshufb): Handle V64QImode.
(expand_vec_perm_1): Handle V64QImode, V32HImode, V16SImode, V16SFmode,
V8DFmode, V8DImode, V4DFmode, V2DFmode, V8SFmode, V4SFmode.
(ix86_expand_vec_perm_const_1): Call ix86_expand_vec_perm_vpermi2.
(ix86_vectorize_vec_perm_const_ok): Handle V32HImode, V64QImode.
(ix86_expand_vecop_qihi): Handle V64QImode.
* config/i386/sse.md
(define_mode_iterator VI1_AVX2): Add V64QI mode.
(define_mode_iterator VEC_PERM_AVX2): Add V32HI mode.
(define_mode_iterator VEC_PERM_CONST): Add V64QI and V32HI mode.
(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"): Add masking.
--
Thanks, K
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 352ab81..d759a45 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
enum machine_mode mode = GET_MODE (op0);
switch (mode)
{
+ /* There is no byte version of vpermi2. So we use vpermi2w. */
+ case V64QImode:
+ if (!TARGET_AVX512BW)
+ return false;
+ rtx mask_lowpart, op0_lowpart, op1_lowpart;
+ rtx perm_lo, perm_hi, tmp, res_lo, tmp2, res_hi;
+
+ mask_lowpart = gen_lowpart (V32HImode, force_reg (V64QImode, mask));
+ op0_lowpart = gen_lowpart (V32HImode, op0);
+ op1_lowpart = gen_lowpart (V32HImode, op1);
+ tmp = gen_reg_rtx (V32HImode);
+ tmp2 = gen_reg_rtx (V32HImode);
+ perm_lo = gen_reg_rtx (V32HImode);
+ perm_hi = gen_reg_rtx (V32HImode);
+ res_lo = gen_reg_rtx (V32HImode);
+ res_hi = gen_reg_rtx (V32HImode);
+
+ emit_insn (gen_ashlv32hi3 (tmp, mask_lowpart, GEN_INT (8)));
+ emit_insn (gen_ashrv32hi3 (perm_lo, tmp, GEN_INT (9)));
+ emit_insn (gen_ashrv32hi3 (perm_hi, mask_lowpart, GEN_INT (9)));
+ emit_insn (gen_avx512bw_vpermi2varv32hi3 (res_lo, op0_lowpart,
+ perm_lo, op1_lowpart));
+ emit_insn (gen_avx512bw_vpermi2varv32hi3 (tmp2, op0_lowpart,
+ perm_hi, op1_lowpart));
+ emit_insn (gen_ashlv32hi3 (res_hi, tmp2, GEN_INT (8)));
+ emit_insn (gen_avx512bw_blendmv64qi (target, gen_lowpart (V64QImode, res_lo),
+ gen_lowpart (V64QImode, res_hi),
+ force_reg (DImode, GEN_INT (0xAAAAAAAAAAAAAAAALL))));
+ return true;
+ case V8HImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0,
+ force_reg (V8HImode, mask), op1));
+ return true;
+ case V16HImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0,
+ force_reg (V16HImode, mask), op1));
+ return true;
+ case V32HImode:
+ emit_insn (gen_avx512bw_vpermi2varv32hi3 (target, op0,
+ force_reg (V32HImode, mask), op1));
+ return true;
+ case V4SImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv4si3 (target, op0,
+ force_reg (V4SImode, mask), op1));
+ return true;
+ case V8SImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv8si3 (target, op0,
+ force_reg (V8SImode, mask), op1));
+ return true;
case V16SImode:
emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
force_reg (V16SImode, mask),
op1));
return true;
+ case V4SFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv4sf3 (target, op0,
+ force_reg (V4SImode, mask), op1));
+ return true;
+ case V8SFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv8sf3 (target, op0,
+ force_reg (V8SImode, mask), op1));
+ return true;
case V16SFmode:
emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
force_reg (V16SImode, mask),
op1));
return true;
+ case V2DImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv2di3 (target, op0,
+ force_reg (V2DImode, mask), op1));
+ return true;
+ case V4DImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv4di3 (target, op0,
+ force_reg (V4DImode, mask), op1));
+ return true;
case V8DImode:
emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
force_reg (V8DImode, mask), op1));
return true;
+ case V2DFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv2df3 (target, op0,
+ force_reg (V2DImode, mask), op1));
+ return true;
+ case V4DFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ emit_insn (gen_avx512vl_vpermi2varv4df3 (target, op0,
+ force_reg (V4DImode, mask), op1));
+ return true;
case V8DFmode:
emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0,
force_reg (V8DImode, mask), op1));
@@ -21779,6 +21872,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
switch (imode)
{
+ case V64QImode:
+ if (unsigned_p)
+ unpack = gen_avx512bw_zero_extendv32qiv32hi2;
+ else
+ unpack = gen_avx512bw_sign_extendv32qiv32hi2;
+ halfmode = V32QImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
+ break;
case V32QImode:
if (unsigned_p)
unpack = gen_avx2_zero_extendv16qiv16hi2;
@@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
if (d->one_operand_p)
return false;
- if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+ if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 &&
+ GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4)
+ ;
+ else if (TARGET_AVX512VL)
+ ;
+ else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
@@ -42693,12 +42800,18 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
switch (vmode)
{
+ case V8DFmode:
+ case V16SFmode:
case V4DFmode:
case V8SFmode:
case V2DFmode:
case V4SFmode:
case V8HImode:
case V8SImode:
+ case V32HImode:
+ case V64QImode:
+ case V16SImode:
+ case V8DImode:
for (i = 0; i < nelt; ++i)
mask |= (d->perm[i] >= nelt) << i;
break;
@@ -42921,9 +43034,9 @@ static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
unsigned i, nelt, eltsz, mask;
- unsigned char perm[32];
+ unsigned char perm[64];
enum machine_mode vmode = V16QImode;
- rtx rperm[32], vperm, target, op0, op1;
+ rtx rperm[64], vperm, target, op0, op1;
nelt = d->nelt;
@@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
return false;
}
}
+ else if (GET_MODE_SIZE (d->vmode) == 64)
+ {
+ if (!TARGET_AVX512BW)
+ return false;
+ if (vmode == V64QImode)
+ {
+ for (i = 0; i < nelt; ++i)
+ if ((d->perm[i] ^ i) & (nelt / 4))
+ return false;
+ }
+ }
else
return false;
}
@@ -43029,6 +43153,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
mask = 2 * nelt - 1;
else if (vmode == V16QImode)
mask = nelt - 1;
+ else if (vmode == V64QImode)
+ mask = nelt / 4 - 1;
else
mask = nelt / 2 - 1;
@@ -43054,6 +43180,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
else if (vmode == V32QImode)
emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+ else if (vmode == V64QImode)
+ emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
else if (vmode == V8SFmode)
emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
else
@@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
rtx (*gen) (rtx, rtx) = NULL;
switch (d->vmode)
{
+ case V64QImode:
+ if (TARGET_AVX512VL)
+ gen = gen_avx512bw_vec_dupv64qi;
+ break;
case V32QImode:
gen = gen_avx2_pbroadcastv32qi_1;
break;
+ case V32HImode:
+ if (TARGET_AVX512VL)
+ gen = gen_avx512bw_vec_dupv32hi;
+ break;
case V16HImode:
gen = gen_avx2_pbroadcastv16hi_1;
break;
+ case V16SImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16si;
+ break;
case V8SImode:
gen = gen_avx2_pbroadcastv8si_1;
break;
@@ -43124,9 +43264,21 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
case V8HImode:
gen = gen_avx2_pbroadcastv8hi;
break;
+ case V16SFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16sf;
+ break;
case V8SFmode:
gen = gen_avx2_vec_dupv8sf_1;
break;
+ case V8DFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8df;
+ break;
+ case V8DImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8di;
+ break;
/* For other modes prefer other shuffles this function creates. */
default: break;
}
@@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
mode = V8DImode;
else if (mode == V16SFmode)
mode = V16SImode;
+ else if (mode == V4DFmode)
+ mode = V4DImode;
+ else if (mode == V2DFmode)
+ mode = V2DImode;
+ else if (mode == V8SFmode)
+ mode = V8SImode;
+ else if (mode == V4SFmode)
+ mode = V4SImode;
for (i = 0; i < nelt; ++i)
vec[i] = GEN_INT (d->perm[i]);
rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
@@ -44759,6 +44919,16 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
return true;
/* Try sequences of two instructions. */
+ /* ix86_expand_vec_perm_vpermi2 is also called from
+ * ix86_expand_vec_perm. So it doesn't take d as parameter.
+ * Construct needed params. */
+ rtx vec[64];
+ int i;
+ for (i = 0; i < d->nelt; ++i)
+ vec[i] = GEN_INT (d->perm[i]);
+ rtx sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, vec));
+ if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, sel, d->op1))
+ return true;
if (expand_vec_perm_pshuflw_pshufhw (d))
return true;
@@ -44933,7 +45103,8 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
/* Given sufficient ISA support we can just return true here
for selected vector modes. */
if (d.vmode == V16SImode || d.vmode == V16SFmode
- || d.vmode == V8DFmode || d.vmode == V8DImode)
+ || d.vmode == V8DFmode || d.vmode == V8DImode
+ || d.vmode == V32HImode || d.vmode == V64QImode)
/* All implementable with a single vpermi2 insn. */
return true;
if (GET_MODE_SIZE (d.vmode) == 16)
@@ -45066,6 +45237,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
gen_il = gen_avx2_interleave_lowv32qi;
gen_ih = gen_avx2_interleave_highv32qi;
break;
+ case V64QImode:
+ himode = V32HImode;
+ gen_il = gen_avx512bw_interleave_lowv64qi;
+ gen_ih = gen_avx512bw_interleave_highv64qi;
+ break;
default:
gcc_unreachable ();
}
@@ -45126,7 +45302,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
/* For SSE2, we used an full interleave, so the desired
results are in the even elements. */
- for (i = 0; i < 32; ++i)
+ for (i = 0; i < 64; ++i)
d.perm[i] = i * 2;
}
else
@@ -45134,7 +45310,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
/* For AVX, the interleave used above was not cross-lane. So the
extraction is evens but with the second and third quarter swapped.
Happily, that is even one insn shorter than even extraction. */
- for (i = 0; i < 32; ++i)
+ for (i = 0; i < 64; ++i)
d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
}
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bb6372a..d3e9635 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -298,7 +298,7 @@
[V8DI (V4DI "TARGET_AVX512VL")])
(define_mode_iterator VI1_AVX2
- [(V32QI "TARGET_AVX2") V16QI])
+ [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI])
(define_mode_iterator VI2_AVX2
[(V16HI "TARGET_AVX2") V8HI])
@@ -10621,7 +10621,8 @@
(V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
(V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")
(V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
- (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")])
+ (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
+ (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW")])
(define_expand "vec_perm<mode>"
[(match_operand:VEC_PERM_AVX2 0 "register_operand")
@@ -10642,7 +10643,8 @@
(V8SI "TARGET_AVX") (V4DI "TARGET_AVX")
(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
- (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
+ (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
+ (V32HI "TARGET_AVX512BW")])
(define_expand "vec_perm_const<mode>"
[(match_operand:VEC_PERM_CONST 0 "register_operand")
@@ -13559,21 +13561,21 @@
(set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
(set_attr "mode" "DI")])
-(define_insn "<ssse3_avx2>_pshufb<mode>3"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
+(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"
+ [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,v")
(unspec:VI1_AVX2
- [(match_operand:VI1_AVX2 1 "register_operand" "0,x")
- (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")]
+ [(match_operand:VI1_AVX2 1 "register_operand" "0,v")
+ (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,vm")]
UNSPEC_PSHUFB))]
- "TARGET_SSSE3"
+ "TARGET_SSSE3 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
"@
pshufb\t{%2, %0|%0, %2}
- vpshufb\t{%2, %1, %0|%0, %1, %2}"
+ vpshufb\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
[(set_attr "isa" "noavx,avx")
(set_attr "type" "sselog1")
(set_attr "prefix_data16" "1,*")
(set_attr "prefix_extra" "1")
- (set_attr "prefix" "orig,vex")
+ (set_attr "prefix" "orig,maybe_evex")
(set_attr "btver2_decode" "vector,vector")
(set_attr "mode" "<sseinsnmode>")])
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63/n] Add vpshufb, perm autogen.
2014-10-06 12:55 [PATCH i386 AVX512] [63/n] Add vpshufb, perm autogen Kirill Yukhin
@ 2014-10-06 14:10 ` Jakub Jelinek
2014-10-09 12:19 ` [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi) Ilya Tocar
0 siblings, 1 reply; 11+ messages in thread
From: Jakub Jelinek @ 2014-10-06 14:10 UTC (permalink / raw)
To: Kirill Yukhin; +Cc: Uros Bizjak, Richard Henderson, GCC Patches
On Mon, Oct 06, 2014 at 04:55:28PM +0400, Kirill Yukhin wrote:
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
> enum machine_mode mode = GET_MODE (op0);
> switch (mode)
> {
> + /* There is no byte version of vpermi2. So we use vpermi2w. */
> + case V64QImode:
> + if (!TARGET_AVX512BW)
> + return false;
> + rtx mask_lowpart, op0_lowpart, op1_lowpart;
> + rtx perm_lo, perm_hi, tmp, res_lo, tmp2, res_hi;
> +
> + mask_lowpart = gen_lowpart (V32HImode, force_reg (V64QImode, mask));
> + op0_lowpart = gen_lowpart (V32HImode, op0);
> + op1_lowpart = gen_lowpart (V32HImode, op1);
> + tmp = gen_reg_rtx (V32HImode);
> + tmp2 = gen_reg_rtx (V32HImode);
> + perm_lo = gen_reg_rtx (V32HImode);
> + perm_hi = gen_reg_rtx (V32HImode);
> + res_lo = gen_reg_rtx (V32HImode);
> + res_hi = gen_reg_rtx (V32HImode);
> +
> + emit_insn (gen_ashlv32hi3 (tmp, mask_lowpart, GEN_INT (8)));
> + emit_insn (gen_ashrv32hi3 (perm_lo, tmp, GEN_INT (9)));
> + emit_insn (gen_ashrv32hi3 (perm_hi, mask_lowpart, GEN_INT (9)));
> + emit_insn (gen_avx512bw_vpermi2varv32hi3 (res_lo, op0_lowpart,
> + perm_lo, op1_lowpart));
> + emit_insn (gen_avx512bw_vpermi2varv32hi3 (tmp2, op0_lowpart,
> + perm_hi, op1_lowpart));
> + emit_insn (gen_ashlv32hi3 (res_hi, tmp2, GEN_INT (8)));
> + emit_insn (gen_avx512bw_blendmv64qi (target, gen_lowpart (V64QImode, res_lo),
> + gen_lowpart (V64QImode, res_hi),
> + force_reg (DImode, GEN_INT (0xAAAAAAAAAAAAAAAALL))));
> + return true;
I believe this case doesn't belong to this function, other than this
case ix86_expand_vec_perm_vpermi2 emits always just a single insn, and
so it should always do that, and there should be a separate function
that expands the worst case of V64QImode full 2 operand permutation.
See my previous mail, IMHO it is doable with 5 instructions rather than 7.
And IMHO we should have a separate function which emits that, supposedly
one for the constant permutations, one for the variable case (perhaps
then your 7 insn sequence is best?).
Also, IMHO rather than building a CONST_VECTOR ahead in each of the callers,
supposedly ix86_expand_vec_perm_vpermi2 could take the arguments it takes
right now plus D, either D would be NULL (then it would behave as now), or
SEL would be NULL, then it would create a CONST_VECTOR on the fly if needed.
I.e. the function would start with a switch that would just contain the
if (...)
return false;
hunks plus break; for the success case, then code to generate CONST_VECTOR
if sel is NULL_RTX from d, and finally another switch with just the emit
cases. Or, the first switch could just set a function pointer before
break, and just use one common
emit_insn (gen (target, op0, force_reg (vmode, mask), op1));
> + case V8HImode:
> + if (!TARGET_AVX512VL)
> + return false;
> + emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0,
> + force_reg (V8HImode, mask), op1));
> + return true;
> + case V16HImode:
> + if (!TARGET_AVX512VL)
> + return false;
> + emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0,
> + force_reg (V16HImode, mask), op1));
> + return true;
Aren't these two insns there only if both TARGET_AVX512VL && TARGET_AVX512BW?
I mean, the ISA pdf mentions both of the CPUID flags simultaneously, and I
think neither of these depends on the other one in GCC. That's unlike insns
where CPUID AVX512VL and AVX512F are mentioned together, because in GCC
AVX512VL depends on AVX512F.
> @@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
>
> if (d->one_operand_p)
> return false;
> - if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
> + if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 &&
> + GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4)
Formatting, && belongs on the second line.
> + ;
> + else if (TARGET_AVX512VL)
I'd add && GET_MODE_SIZE (GET_MODE_INNER (vmode)) == 64 here.
AVX512VL is not going to handle 64-bit vectors, or 1024-bit ones,
and the == 32 and == 16 cases are handled because AVX512VL implies
TARGET_AVX2 and TARGET_SSE4_1, doesn't it?
> @@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
> return false;
> }
> }
> + else if (GET_MODE_SIZE (d->vmode) == 64)
> + {
> + if (!TARGET_AVX512BW)
> + return false;
> + if (vmode == V64QImode)
> + {
> + for (i = 0; i < nelt; ++i)
> + if ((d->perm[i] ^ i) & (nelt / 4))
> + return false;
Missing comment, I'd duplicate the
/* vpshufb only works intra lanes, it is not
possible to shuffle bytes in between the lanes. */
comment there.
> @@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
> rtx (*gen) (rtx, rtx) = NULL;
> switch (d->vmode)
> {
> + case V64QImode:
> + if (TARGET_AVX512VL)
VL? Isn't that BW?
> + gen = gen_avx512bw_vec_dupv64qi;
> + break;
> case V32QImode:
> gen = gen_avx2_pbroadcastv32qi_1;
> break;
> + case V32HImode:
> + if (TARGET_AVX512VL)
Ditto.
> @@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
> mode = V8DImode;
> else if (mode == V16SFmode)
> mode = V16SImode;
> + else if (mode == V4DFmode)
> + mode = V4DImode;
> + else if (mode == V2DFmode)
> + mode = V2DImode;
> + else if (mode == V8SFmode)
> + mode = V8SImode;
> + else if (mode == V4SFmode)
> + mode = V4SImode;
> for (i = 0; i < nelt; ++i)
> vec[i] = GEN_INT (d->perm[i]);
> rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
See above comment about CONST_VECTOR.
> @@ -44759,6 +44919,16 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> return true;
>
> /* Try sequences of two instructions. */
> + /* ix86_expand_vec_perm_vpermi2 is also called from
> + * ix86_expand_vec_perm. So it doesn't take d as parameter.
> + * Construct needed params. */
> + rtx vec[64];
> + int i;
> + for (i = 0; i < d->nelt; ++i)
> + vec[i] = GEN_INT (d->perm[i]);
> + rtx sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, vec));
> + if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, sel, d->op1))
> + return true;
>
> if (expand_vec_perm_pshuflw_pshufhw (d))
> return true;
I don't understand this. Doesn't ix86_expand_vec_perm_vpermi2 generate
(except for the V64QI case discussed above) a single insn? Then
expand_vec_perm_1 should have handled that already, so this is just a waste
of resources here.
> @@ -44933,7 +45103,8 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
> /* Given sufficient ISA support we can just return true here
> for selected vector modes. */
> if (d.vmode == V16SImode || d.vmode == V16SFmode
> - || d.vmode == V8DFmode || d.vmode == V8DImode)
> + || d.vmode == V8DFmode || d.vmode == V8DImode
> + || d.vmode == V32HImode || d.vmode == V64QImode)
> /* All implementable with a single vpermi2 insn. */
> return true;
1) Shouldn't this be guarded with TARGET_AVX512F &&
and in the V32HImode/V64QImode also with TARGET_AVX512BW?
The comment is not correct for V64QImode.
2) For TARGET_AVX512VL, vpermi2 can handle also smaller mode sizes.
Perhaps it would be best to turn this into
switch (d.vmode)
{
case V16SImode:
case V16SFmode:
case V8DFmode:
case V8DImode:
if (TARGET_AVX512F)
/* All implementable with a single vpermi2 insn. */
return true;
break;
case V32HImode:
if (TARGET_AVX512BW)
/* Implementable with a single vpermi2 insn. */
return true;
break;
case V64QImode:
if (TARGET_AVX512BW)
/* Implementable with 2 vpermi2w, 2 vpshufb and one vpor insns. */
return true;
break;
case V8SImode:
case V8SFmode:
case V4DFmode:
case V4DImode:
if (TARGET_AVX512VL)
/* Implementable with a single vpermi2 insn. */
return true;
break;
case V16HImode:
if (TARGET_AVX512VL && TARGET_AVX512BW)
/* Implementable with a single vpermi2 insn. */
return true;
if (TARGET_AVX2)
/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
return true;
break;
case V32QImode:
if (TARGET_AVX2)
/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
return true;
break;
case V4SImode:
case V4SFmode:
case V8HImode:
case V16QImode:
/* All implementable with a single vpperm insn. */
if (TARGET_XOP)
return true;
/* All implementable with 2 pshufb + 1 ior. */
if (TARGET_SSSE3)
return true;
break;
case V2DImode:
case V2DFmode:
/* All implementable with shufpd or unpck[lh]pd. */
return true;
}
Now, for V8SI/V8SF/V4DI/V4DF, I wonder if we have (for either AVX or AVX2)
any expanders that guarantee we generate some sequence for all possible
2 operand constant permutations. I think ix86_expand_vec_perm is able
to emit the non-constant permutations for all of these, so in theory
we should have an upper bound for all these.
Jakub
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
2014-10-06 14:10 ` Jakub Jelinek
@ 2014-10-09 12:19 ` Ilya Tocar
2014-10-09 18:51 ` Jakub Jelinek
0 siblings, 1 reply; 11+ messages in thread
From: Ilya Tocar @ 2014-10-09 12:19 UTC (permalink / raw)
To: Jakub Jelinek; +Cc: Kirill Yukhin, Uros Bizjak, Richard Henderson, GCC Patches
Hi,
I think this patch should be split in 2 parts:
V64QI related and non-V64QI related.
This part contains non-V64QI related changes.
Also I've noticed, that not all patterns using VI1_AVX2,
actually have AVX512 versions, so fixed bogus patterns.
On 06 Oct 16:10, Jakub Jelinek wrote:
> On Mon, Oct 06, 2014 at 04:55:28PM +0400, Kirill Yukhin wrote:
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
> > enum machine_mode mode = GET_MODE (op0);
> > switch (mode)
> > {
> > + /* There is no byte version of vpermi2. So we use vpermi2w. */
> > + case V64QImode:
...
>
> I believe this case doesn't belong to this function, other than this
> case ix86_expand_vec_perm_vpermi2 emits always just a single insn, and
> so it should always do that, and there should be a separate function
> that expands the worst case of V64QImode full 2 operand permutation.
> See my previous mail, IMHO it is doable with 5 instructions rather than 7.
> And IMHO we should have a separate function which emits that, supposedly
> one for the constant permutations, one for the variable case (perhaps
> then your 7 insn sequence is best?).
This will be done in following patch.
>
> Also, IMHO rather than building a CONST_VECTOR ahead in each of the callers,
> supposedly ix86_expand_vec_perm_vpermi2 could take the arguments it takes
> right now plus D, either D would be NULL (then it would behave as now), or
> SEL would be NULL, then it would create a CONST_VECTOR on the fly if needed.
> I.e. the function would start with a switch that would just contain the
> if (...)
> return false;
> hunks plus break; for the success case, then code to generate CONST_VECTOR
> if sel is NULL_RTX from d, and finally another switch with just the emit
> cases.
Done.
>
> > + case V8HImode:
> > + if (!TARGET_AVX512VL)
> > + return false;
> > + emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0,
> > + force_reg (V8HImode, mask), op1));
> > + return true;
> > + case V16HImode:
> > + if (!TARGET_AVX512VL)
> > + return false;
> > + emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0,
> > + force_reg (V16HImode, mask), op1));
> > + return true;
>
> Aren't these two insns there only if both TARGET_AVX512VL && TARGET_AVX512BW?
> I mean, the ISA pdf mentions both of the CPUID flags simultaneously, and I
> think neither of these depends on the other one in GCC. That's unlike insns
> where CPUID AVX512VL and AVX512F are mentioned together, because in GCC
> AVX512VL depends on AVX512F.
>
Good catch!
> > @@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
> >
> > if (d->one_operand_p)
> > return false;
> > - if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
> > + if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 &&
> > + GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4)
>
> Formatting, && belongs on the second line.
>
Fixed.
> > + ;
> > + else if (TARGET_AVX512VL)
>
> I'd add && GET_MODE_SIZE (GET_MODE_INNER (vmode) == 64 here.
> AVX512VL is not going to handle 64-bit vectors, or 1024-bit ones,
> and the == 32 and == 16 cases are handled because AVX512VL implies
> TARGET_AVX2 and TARGET_SSE4_1, doesn't it?
>
As TARGET_AVX512VL always implies TARGET_AVX2 and TARGET_SSE4_1 and
works only on 32/16-byte mode this case is redundant, so I've removed
it.
> > @@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
> > return false;
> > }
> > }
> > + else if (GET_MODE_SIZE (d->vmode) == 64)
> > + {
> > + if (!TARGET_AVX512BW)
> > + return false;
> > + if (vmode == V64QImode)
> > + {
> > + for (i = 0; i < nelt; ++i)
> > + if ((d->perm[i] ^ i) & (nelt / 4))
> > + return false;
>
> Missing comment, I'd duplicate the
> /* vpshufb only works intra lanes, it is not
> possible to shuffle bytes in between the lanes. */
> comment there.
>
Done.
> > @@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
> > rtx (*gen) (rtx, rtx) = NULL;
> > switch (d->vmode)
> > {
> > + case V64QImode:
> > + if (TARGET_AVX512VL)
>
> VL? Isn't that BW?
>
> > + gen = gen_avx512bw_vec_dupv64qi;
> > + break;
> > case V32QImode:
> > gen = gen_avx2_pbroadcastv32qi_1;
> > break;
> > + case V32HImode:
> > + if (TARGET_AVX512VL)
>
> Ditto.
>
Fixed.
> > @@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
> > mode = V8DImode;
> > else if (mode == V16SFmode)
> > mode = V16SImode;
> > + else if (mode == V4DFmode)
> > + mode = V4DImode;
> > + else if (mode == V2DFmode)
> > + mode = V2DImode;
> > + else if (mode == V8SFmode)
> > + mode = V8SImode;
> > + else if (mode == V4SFmode)
> > + mode = V4SImode;
> > for (i = 0; i < nelt; ++i)
> > vec[i] = GEN_INT (d->perm[i]);
> > rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
>
> See above comment about CONST_VECTOR.
>
Done.
> > @@ -44759,6 +44919,16 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> > return true;
> >
> > /* Try sequences of two instructions. */
> > + /* ix86_expand_vec_perm_vpermi2 is also called from
> > + * ix86_expand_vec_perm. So it doesn't take d as parameter.
> > + * Construct needed params. */
> > + rtx vec[64];
> > + int i;
> > + for (i = 0; i < d->nelt; ++i)
> > + vec[i] = GEN_INT (d->perm[i]);
> > + rtx sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, vec));
> > + if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, sel, d->op1))
> > + return true;
> >
> > if (expand_vec_perm_pshuflw_pshufhw (d))
> > return true;
>
> I don't understand this. Doesn't ix86_expand_vec_perm_vpermi2 generate
> (except for the V64QI case discussed above) a single insn? Then
> expand_vec_perm_1 should have handled that already, so this is just a waste
> of resources here.
>
Removed.
> > @@ -44933,7 +45103,8 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
> > /* Given sufficient ISA support we can just return true here
> > for selected vector modes. */
> > if (d.vmode == V16SImode || d.vmode == V16SFmode
> > - || d.vmode == V8DFmode || d.vmode == V8DImode)
> > + || d.vmode == V8DFmode || d.vmode == V8DImode
> > + || d.vmode == V32HImode || d.vmode == V64QImode)
> > /* All implementable with a single vpermi2 insn. */
> > return true;
>
> 1) Shouldn't this be guarded with TARGET_AVX512F &&
> and in the V32HImode/V64QImode also with TARGET_AVX512BW?
> The comment is not correct for V64QImode.
>
There are probably no 512-bit modes without AVX512F, but I've
refactored it as per your suggestion below.
> 2) For TARGET_AVX512VL, vpermi2 can handle also smaller mode sizes.
> Perhaps it would be best to turn this into
> switch (d.vmode)
> {
> case V16SImode:
> case V16SFmode:
> case V8DFmode:
> case V8DImode:
> if (TARGET_AVX512F)
> /* All implementable with a single vpermi2 insn. */
> break;
...
>
> Now, for V8SI/V8SF/V4DI/V4DF, I wonder if we have (for either AVX or AVX2)
> any expanders that guarantee we generate some sequence for all possible
> 2 operand constant permutations. I think ix86_expand_vec_perm is able
> to emit the non-constant permutations for all of these, so in theory
> we should have an upper bound for all these.
>
I'm not sure about it, so for now I've left V8SI/V8SF/V4DI/V4DF out.
Updated patch below:
gcc/
* config/i386/i386.c
(MAX_VECT_LEN): Move above ix86_expand_vec_perm_vpermi2.
(struct expand_vec_perm_d): Ditto.
(ix86_expand_vec_perm_vpermi2): Handle V8HImode, V16HImode, V2DFmode,
V32HImode, V4SImode, V8SImode, V4SFmode, V8SFmode, V2DImode, V4DImode,
V4DFmode.
(ix86_expand_vec_perm): Update ix86_expand_vec_perm_vpermi2 signature.
(ix86_expand_sse_unpack): Handle V64QImode.
(expand_vec_perm_blend): Update conditions for TARGET, handle
V8DFmode, V16SFmode, V32HImode, V64QImode, V16SImode, V8DImode.
(expand_vec_perm_pshufb): Handle V64QImode.
(expand_vec_perm_1): Handle V64QImode, V32HImode, V16SImode, V16SFmode,
V8DFmode, V8DImode, V4DFmode, V2DFmode, V8SFmode, V4SFmode.
(ix86_expand_vec_perm_const_1): Call ix86_expand_vec_perm_vpermi2.
(ix86_vectorize_vec_perm_const_ok): Handle V32HImode, V64QImode.
(ix86_expand_vecop_qihi): Handle V64QImode.
* config/i386/sse.md
(define_mode_iterator VI1_AVX512): New.
(define_mode_iterator VEC_PERM_AVX2): Add V32HI mode.
(define_mode_iterator VEC_PERM_CONST): Add V32HI mode.
(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"): Add masking.
(mul<mode>3): Use VI1_AVX512.
(<sse2_avx2>_packsswb): Ditto.
(<sse2_avx2>_packuswb): Ditto.
(<ssse3_avx2>_pshufb<mode>3): Ditto.
(<shift_insn><mode>3): Ditto.
---
gcc/config/i386/i386.c | 293 ++++++++++++++++++++++++++++++++++++++++++-------
gcc/config/i386/sse.md | 45 ++++----
2 files changed, 278 insertions(+), 60 deletions(-)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 352ab81..426ea9e 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21358,32 +21358,169 @@ ix86_expand_int_vcond (rtx operands[])
return true;
}
+/* AVX512F does support 64-byte integer vector operations,
+ thus the longest vector we are faced with is V64QImode. */
+#define MAX_VECT_LEN 64
+
+struct expand_vec_perm_d
+{
+ rtx target, op0, op1;
+ unsigned char perm[MAX_VECT_LEN];
+ enum machine_mode vmode;
+ unsigned char nelt;
+ bool one_operand_p;
+ bool testing_p;
+};
+
static bool
-ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
+ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, struct expand_vec_perm_d *d)
{
- enum machine_mode mode = GET_MODE (op0);
+ enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
+
switch (mode)
{
+ case V8HImode:
+ if (!TARGET_AVX512VL || !TARGET_AVX512BW)
+ return false;
+ break;
+ case V16HImode:
+ if (!TARGET_AVX512VL || !TARGET_AVX512BW)
+ return false;
+ case V32HImode:
+ if (!TARGET_AVX512BW)
+ return false;
+ break;
+ case V4SImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V8SImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V16SImode:
+ if (!TARGET_AVX512F)
+ return false;
+ break;
+ case V4SFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V8SFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V16SFmode:
+ if (!TARGET_AVX512F)
+ return false;
+ break;
+ case V2DImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V4DImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V8DImode:
+ if (!TARGET_AVX512F)
+ return false;
+ break;
+ case V2DFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V4DFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ break;
+ case V8DFmode:
+ if (!TARGET_AVX512F)
+ return false;
+ break;
+ default:
+ return false;
+ }
+
+ /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const expander,
+ so args are either in d, or in op0, op1 etc. */
+ if (d)
+ {
+ rtx vec[64];
+ target = d->target;
+ op0 = d->op0;
+ op1 = d->op1;
+ for (int i = 0; i < d->nelt; ++i)
+ vec[i] = GEN_INT (d->perm[i]);
+ mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (d->nelt, vec));
+ }
+
+ switch (mode)
+ {
+ case V8HImode:
+ emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0,
+ force_reg (V8HImode, mask), op1));
+ return true;
+ case V16HImode:
+ emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0,
+ force_reg (V16HImode, mask), op1));
+ return true;
+ case V32HImode:
+ emit_insn (gen_avx512bw_vpermi2varv32hi3 (target, op0,
+ force_reg (V32HImode, mask), op1));
+ return true;
+ case V4SImode:
+ emit_insn (gen_avx512vl_vpermi2varv4si3 (target, op0,
+ force_reg (V4SImode, mask), op1));
+ return true;
+ case V8SImode:
+ emit_insn (gen_avx512vl_vpermi2varv8si3 (target, op0,
+ force_reg (V8SImode, mask), op1));
+ return true;
case V16SImode:
emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
force_reg (V16SImode, mask),
op1));
return true;
+ case V4SFmode:
+ emit_insn (gen_avx512vl_vpermi2varv4sf3 (target, op0,
+ force_reg (V4SImode, mask), op1));
+ return true;
+ case V8SFmode:
+ emit_insn (gen_avx512vl_vpermi2varv8sf3 (target, op0,
+ force_reg (V8SImode, mask), op1));
+ return true;
case V16SFmode:
emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
force_reg (V16SImode, mask),
op1));
return true;
+ case V2DImode:
+ emit_insn (gen_avx512vl_vpermi2varv2di3 (target, op0,
+ force_reg (V2DImode, mask), op1));
+ return true;
+ case V4DImode:
+ emit_insn (gen_avx512vl_vpermi2varv4di3 (target, op0,
+ force_reg (V4DImode, mask), op1));
+ return true;
case V8DImode:
emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
force_reg (V8DImode, mask), op1));
return true;
+ case V2DFmode:
+ emit_insn (gen_avx512vl_vpermi2varv2df3 (target, op0,
+ force_reg (V2DImode, mask), op1));
+ return true;
+ case V4DFmode:
+ emit_insn (gen_avx512vl_vpermi2varv4df3 (target, op0,
+ force_reg (V4DImode, mask), op1));
+ return true;
case V8DFmode:
emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0,
force_reg (V8DImode, mask), op1));
return true;
default:
- return false;
+ gcc_unreachable ();
}
}
@@ -21407,7 +21544,7 @@ ix86_expand_vec_perm (rtx operands[])
e = GET_MODE_UNIT_SIZE (mode);
gcc_assert (w <= 64);
- if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1))
+ if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
return;
if (TARGET_AVX2)
@@ -21779,6 +21916,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
switch (imode)
{
+ case V64QImode:
+ if (unsigned_p)
+ unpack = gen_avx512bw_zero_extendv32qiv32hi2;
+ else
+ unpack = gen_avx512bw_sign_extendv32qiv32hi2;
+ halfmode = V32QImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
+ break;
case V32QImode:
if (unsigned_p)
unpack = gen_avx2_zero_extendv16qiv16hi2;
@@ -39603,20 +39749,6 @@ x86_emit_floatuns (rtx operands[2])
emit_label (donelab);
}
\f
-/* AVX512F does support 64-byte integer vector operations,
- thus the longest vector we are faced with is V64QImode. */
-#define MAX_VECT_LEN 64
-
-struct expand_vec_perm_d
-{
- rtx target, op0, op1;
- unsigned char perm[MAX_VECT_LEN];
- enum machine_mode vmode;
- unsigned char nelt;
- bool one_operand_p;
- bool testing_p;
-};
-
static bool canonicalize_perm (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
@@ -42662,7 +42794,10 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
if (d->one_operand_p)
return false;
- if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+ if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
+ && GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4)
+ ;
+ else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
@@ -42693,12 +42828,18 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
switch (vmode)
{
+ case V8DFmode:
+ case V16SFmode:
case V4DFmode:
case V8SFmode:
case V2DFmode:
case V4SFmode:
case V8HImode:
case V8SImode:
+ case V32HImode:
+ case V64QImode:
+ case V16SImode:
+ case V8DImode:
for (i = 0; i < nelt; ++i)
mask |= (d->perm[i] >= nelt) << i;
break;
@@ -42921,9 +43062,9 @@ static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
unsigned i, nelt, eltsz, mask;
- unsigned char perm[32];
+ unsigned char perm[64];
enum machine_mode vmode = V16QImode;
- rtx rperm[32], vperm, target, op0, op1;
+ rtx rperm[64], vperm, target, op0, op1;
nelt = d->nelt;
@@ -43012,6 +43153,19 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
return false;
}
}
+ else if (GET_MODE_SIZE (d->vmode) == 64)
+ {
+ if (!TARGET_AVX512BW)
+ return false;
+ if (vmode == V64QImode)
+ {
+ /* vpshufb only works intra lanes, it is not
+ possible to shuffle bytes in between the lanes. */
+ for (i = 0; i < nelt; ++i)
+ if ((d->perm[i] ^ i) & (nelt / 4))
+ return false;
+ }
+ }
else
return false;
}
@@ -43029,6 +43183,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
mask = 2 * nelt - 1;
else if (vmode == V16QImode)
mask = nelt - 1;
+ else if (vmode == V64QImode)
+ mask = nelt / 4 - 1;
else
mask = nelt / 2 - 1;
@@ -43054,6 +43210,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
else if (vmode == V32QImode)
emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+ else if (vmode == V64QImode)
+ emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
else if (vmode == V8SFmode)
emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
else
@@ -43109,12 +43267,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
rtx (*gen) (rtx, rtx) = NULL;
switch (d->vmode)
{
+ case V64QImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_vec_dupv64qi;
+ break;
case V32QImode:
gen = gen_avx2_pbroadcastv32qi_1;
break;
+ case V32HImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_vec_dupv32hi;
+ break;
case V16HImode:
gen = gen_avx2_pbroadcastv16hi_1;
break;
+ case V16SImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16si;
+ break;
case V8SImode:
gen = gen_avx2_pbroadcastv8si_1;
break;
@@ -43124,9 +43294,21 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
case V8HImode:
gen = gen_avx2_pbroadcastv8hi;
break;
+ case V16SFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16sf;
+ break;
case V8SFmode:
gen = gen_avx2_vec_dupv8sf_1;
break;
+ case V8DFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8df;
+ break;
+ case V8DImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8di;
+ break;
/* For other modes prefer other shuffles this function creates. */
default: break;
}
@@ -43210,16 +43392,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
return true;
/* Try the AVX512F vpermi2 instructions. */
- rtx vec[64];
- enum machine_mode mode = d->vmode;
- if (mode == V8DFmode)
- mode = V8DImode;
- else if (mode == V16SFmode)
- mode = V16SImode;
- for (i = 0; i < nelt; ++i)
- vec[i] = GEN_INT (d->perm[i]);
- rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
- if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, mask, d->op1))
+ if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
return true;
return false;
@@ -44932,21 +45105,56 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
/* Given sufficient ISA support we can just return true here
for selected vector modes. */
- if (d.vmode == V16SImode || d.vmode == V16SFmode
- || d.vmode == V8DFmode || d.vmode == V8DImode)
- /* All implementable with a single vpermi2 insn. */
- return true;
- if (GET_MODE_SIZE (d.vmode) == 16)
+ switch (d.vmode)
{
+ case V16SFmode:
+ case V16SImode:
+ case V8DImode:
+ case V8DFmode:
+ if (TARGET_AVX512F)
+ /* All implementable with a single vpermi2 insn. */
+ return true;
+ break;
+ case V32HImode:
+ if (TARGET_AVX512BW)
+ /* All implementable with a single vpermi2 insn. */
+ return true;
+ break;
+ case V8SImode:
+ case V8SFmode:
+ case V4DFmode:
+ case V4DImode:
+ if (TARGET_AVX512VL)
+ /* All implementable with a single vpermi2 insn. */
+ return true;
+ break;
+ case V16HImode:
+ if (TARGET_AVX2)
+ /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
+ return true;
+ break;
+ case V32QImode:
+ if (TARGET_AVX2)
+ /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
+ return true;
+ break;
+ case V4SImode:
+ case V4SFmode:
+ case V8HImode:
+ case V16QImode:
/* All implementable with a single vpperm insn. */
if (TARGET_XOP)
return true;
/* All implementable with 2 pshufb + 1 ior. */
if (TARGET_SSSE3)
return true;
+ break;
+ case V2DImode:
+ case V2DFmode:
/* All implementable with shufpd or unpck[lh]pd. */
- if (d.nelt == 2)
- return true;
+ return true;
+ default:
+ return false;
}
/* Extract the values from the vector CST into the permutation
@@ -45066,6 +45274,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
gen_il = gen_avx2_interleave_lowv32qi;
gen_ih = gen_avx2_interleave_highv32qi;
break;
+ case V64QImode:
+ himode = V32HImode;
+ gen_il = gen_avx512bw_interleave_lowv64qi;
+ gen_ih = gen_avx512bw_interleave_highv64qi;
+ break;
default:
gcc_unreachable ();
}
@@ -45126,7 +45339,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
/* For SSE2, we used an full interleave, so the desired
results are in the even elements. */
- for (i = 0; i < 32; ++i)
+ for (i = 0; i < 64; ++i)
d.perm[i] = i * 2;
}
else
@@ -45134,7 +45347,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
/* For AVX, the interleave used above was not cross-lane. So the
extraction is evens but with the second and third quarter swapped.
Happily, that is even one insn shorter than even extraction. */
- for (i = 0; i < 32; ++i)
+ for (i = 0; i < 64; ++i)
d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
}
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bb6372a..460cbff 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -300,6 +300,9 @@
(define_mode_iterator VI1_AVX2
[(V32QI "TARGET_AVX2") V16QI])
+(define_mode_iterator VI1_AVX512
+ [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI])
+
(define_mode_iterator VI2_AVX2
[(V16HI "TARGET_AVX2") V8HI])
@@ -9239,9 +9242,9 @@
(set_attr "mode" "TI")])
(define_expand "mul<mode>3"
- [(set (match_operand:VI1_AVX2 0 "register_operand")
- (mult:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand")
- (match_operand:VI1_AVX2 2 "register_operand")))]
+ [(set (match_operand:VI1_AVX512 0 "register_operand")
+ (mult:VI1_AVX512 (match_operand:VI1_AVX512 1 "register_operand")
+ (match_operand:VI1_AVX512 2 "register_operand")))]
"TARGET_SSE2"
{
ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
@@ -10621,7 +10624,8 @@
(V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
(V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")
(V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
- (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")])
+ (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
+ (V32HI "TARGET_AVX512BW")])
(define_expand "vec_perm<mode>"
[(match_operand:VEC_PERM_AVX2 0 "register_operand")
@@ -10642,7 +10646,8 @@
(V8SI "TARGET_AVX") (V4DI "TARGET_AVX")
(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
- (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
+ (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
+ (V32HI "TARGET_AVX512BW")])
(define_expand "vec_perm_const<mode>"
[(match_operand:VEC_PERM_CONST 0 "register_operand")
@@ -11006,8 +11011,8 @@
})
(define_insn "<sse2_avx2>_packsswb"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
- (vec_concat:VI1_AVX2
+ [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x")
+ (vec_concat:VI1_AVX512
(ss_truncate:<ssehalfvecmode>
(match_operand:<sseunpackmode> 1 "register_operand" "0,x"))
(ss_truncate:<ssehalfvecmode>
@@ -11040,8 +11045,8 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn "<sse2_avx2>_packuswb"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
- (vec_concat:VI1_AVX2
+ [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x")
+ (vec_concat:VI1_AVX512
(us_truncate:<ssehalfvecmode>
(match_operand:<sseunpackmode> 1 "register_operand" "0,x"))
(us_truncate:<ssehalfvecmode>
@@ -13559,21 +13564,21 @@
(set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
(set_attr "mode" "DI")])
-(define_insn "<ssse3_avx2>_pshufb<mode>3"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
- (unspec:VI1_AVX2
- [(match_operand:VI1_AVX2 1 "register_operand" "0,x")
- (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")]
+(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"
+ [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,v")
+ (unspec:VI1_AVX512
+ [(match_operand:VI1_AVX512 1 "register_operand" "0,v")
+ (match_operand:VI1_AVX512 2 "nonimmediate_operand" "xm,vm")]
UNSPEC_PSHUFB))]
- "TARGET_SSSE3"
+ "TARGET_SSSE3 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
"@
pshufb\t{%2, %0|%0, %2}
- vpshufb\t{%2, %1, %0|%0, %1, %2}"
+ vpshufb\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
[(set_attr "isa" "noavx,avx")
(set_attr "type" "sselog1")
(set_attr "prefix_data16" "1,*")
(set_attr "prefix_extra" "1")
- (set_attr "prefix" "orig,vex")
+ (set_attr "prefix" "orig,maybe_evex")
(set_attr "btver2_decode" "vector,vector")
(set_attr "mode" "<sseinsnmode>")])
@@ -15948,9 +15953,9 @@
(set_attr "mode" "TI")])
(define_expand "<shift_insn><mode>3"
- [(set (match_operand:VI1_AVX2 0 "register_operand")
- (any_shift:VI1_AVX2
- (match_operand:VI1_AVX2 1 "register_operand")
+ [(set (match_operand:VI1_AVX512 0 "register_operand")
+ (any_shift:VI1_AVX512
+ (match_operand:VI1_AVX512 1 "register_operand")
(match_operand:SI 2 "nonmemory_operand")))]
"TARGET_SSE2"
{
--
1.8.3.1
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
2014-10-09 12:19 ` [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi) Ilya Tocar
@ 2014-10-09 18:51 ` Jakub Jelinek
2014-10-10 15:49 ` Ilya Tocar
0 siblings, 1 reply; 11+ messages in thread
From: Jakub Jelinek @ 2014-10-09 18:51 UTC (permalink / raw)
To: Ilya Tocar; +Cc: Kirill Yukhin, Uros Bizjak, Richard Henderson, GCC Patches
On Thu, Oct 09, 2014 at 04:15:23PM +0400, Ilya Tocar wrote:
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -21358,32 +21358,169 @@ ix86_expand_int_vcond (rtx operands[])
> return true;
> }
>
> +/* AVX512F does support 64-byte integer vector operations,
> + thus the longest vector we are faced with is V64QImode. */
> +#define MAX_VECT_LEN 64
> +
> +struct expand_vec_perm_d
> +{
> + rtx target, op0, op1;
> + unsigned char perm[MAX_VECT_LEN];
> + enum machine_mode vmode;
> + unsigned char nelt;
> + bool one_operand_p;
> + bool testing_p;
> +};
> +
> static bool
> -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
> +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, struct expand_vec_perm_d *d)
Too long line, please wrap it.
> {
> - enum machine_mode mode = GET_MODE (op0);
> + enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
> +
> switch (mode)
> {
> + case V8HImode:
> + if (!TARGET_AVX512VL || !TARGET_AVX512BW)
> + return false;
> + break;
> + case V16HImode:
> + if (!TARGET_AVX512VL || !TARGET_AVX512BW)
> + return false;
> + case V32HImode:
> + if (!TARGET_AVX512BW)
> + return false;
> + break;
> + case V4SImode:
> + if (!TARGET_AVX512VL)
> + return false;
> + break;
> + case V8SImode:
> + if (!TARGET_AVX512VL)
> + return false;
> + break;
> + case V16SImode:
> + if (!TARGET_AVX512F)
> + return false;
> + break;
> + case V4SFmode:
> + if (!TARGET_AVX512VL)
> + return false;
> + break;
> + case V8SFmode:
> + if (!TARGET_AVX512VL)
> + return false;
> + break;
> + case V16SFmode:
> + if (!TARGET_AVX512F)
> + return false;
> + break;
> + case V2DImode:
> + if (!TARGET_AVX512VL)
> + return false;
> + break;
> + case V4DImode:
> + if (!TARGET_AVX512VL)
> + return false;
> + break;
> + case V8DImode:
> + if (!TARGET_AVX512F)
> + return false;
> + break;
> + case V2DFmode:
> + if (!TARGET_AVX512VL)
> + return false;
> + break;
> + case V4DFmode:
> + if (!TARGET_AVX512VL)
> + return false;
> + break;
> + case V8DFmode:
> + if (!TARGET_AVX512F)
> + return false;
> + break;
> + default:
> + return false;
> + }
> +
> + /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const expander,
> + so args are either in d, or in op0, op1 etc. */
> + if (d)
> + {
> + rtx vec[64];
> + target = d->target;
> + op0 = d->op0;
> + op1 = d->op1;
> + for (int i = 0; i < d->nelt; ++i)
> + vec[i] = GEN_INT (d->perm[i]);
> + mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (d->nelt, vec));
Shouldn't the mask use an integral vector mode rather than a floating-point one?
My strong preference would be:
enum machine_mode maskmode = mode;
rtx (*gen) (rtx, rtx, rtx, rtx);
right below the enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
line and then inside of the first switch just do:
...
case V16SImode:
if (!TARGET_AVX512F)
return false;
gen = gen_avx512f_vpermi2varv16si3;
break;
case V4SFmode:
if (!TARGET_AVX512VL)
return false;
gen = gen_avx512vl_vpermi2varv4sf3;
maskmode = V4SImode;
break;
...
etc., then in the mask = line use:
mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
and finally instead of the second switch do:
emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
return true;
Otherwise, the patch LGTM, but will leave the final approval to Uros.
Jakub
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
2014-10-09 18:51 ` Jakub Jelinek
@ 2014-10-10 15:49 ` Ilya Tocar
2014-10-10 15:59 ` Jakub Jelinek
2014-10-10 16:39 ` Uros Bizjak
0 siblings, 2 replies; 11+ messages in thread
From: Ilya Tocar @ 2014-10-10 15:49 UTC (permalink / raw)
To: Jakub Jelinek; +Cc: Kirill Yukhin, Uros Bizjak, Richard Henderson, GCC Patches
On 09 Oct 20:51, Jakub Jelinek wrote:
> On Thu, Oct 09, 2014 at 04:15:23PM +0400, Ilya Tocar wrote:
> > --- a/gcc/config/i386/i386.c
> > +++ b/gcc/config/i386/i386.c
> > @@ -21358,32 +21358,169 @@ ix86_expand_int_vcond (rtx operands[])
> > return true;
> > }
> >
> > -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
> > +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, struct expand_vec_perm_d *d)
>
> Too long line, please wrap it.
>
Fixed.
> > {
> > - enum machine_mode mode = GET_MODE (op0);
> > + enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
> > +
> > switch (mode)
> > {
> > + case V8HImode:
> > + if (!TARGET_AVX512VL || !TARGET_AVX512BW)
> > + return false;
>
> My strong preference would be:
> enum machine_mode maskmode = mode;
> rtx (*gen) (rtx, rtx, rtx, rtx);
> right below the enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
> line and then inside of the first switch just do:
> ...
> case V16SImode:
> if (!TARGET_AVX512F)
> return false;
> gen = gen_avx512f_vpermi2varv16si3;
> break;
> case V4SFmode:
> if (!TARGET_AVX512VL)
> return false;
> gen = gen_avx512vl_vpermi2varv4sf3;
> maskmode = V4SImode;
> break;
> ...
> etc., then in the mask = line use:
> mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
> and finally instead of the second switch do:
> emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
> return true;
>
Updated patch below.
---
gcc/config/i386/i386.c | 281 +++++++++++++++++++++++++++++++++++++++----------
gcc/config/i386/sse.md | 45 ++++----
2 files changed, 253 insertions(+), 73 deletions(-)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 352ab81..2247da8 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21358,33 +21358,132 @@ ix86_expand_int_vcond (rtx operands[])
return true;
}
+/* AVX512F does support 64-byte integer vector operations,
+ thus the longest vector we are faced with is V64QImode. */
+#define MAX_VECT_LEN 64
+
+struct expand_vec_perm_d
+{
+ rtx target, op0, op1;
+ unsigned char perm[MAX_VECT_LEN];
+ enum machine_mode vmode;
+ unsigned char nelt;
+ bool one_operand_p;
+ bool testing_p;
+};
+
static bool
-ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
+ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
+ struct expand_vec_perm_d *d)
{
- enum machine_mode mode = GET_MODE (op0);
+ /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
+ expander, so args are either in d, or in op0, op1 etc. */
+ enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
+ enum machine_mode maskmode = mode;
+ rtx (*gen) (rtx, rtx, rtx, rtx);
+
switch (mode)
{
+ case V8HImode:
+ if (!TARGET_AVX512VL || !TARGET_AVX512BW)
+ return false;
+ gen = gen_avx512vl_vpermi2varv8hi3;
+ break;
+ case V16HImode:
+ if (!TARGET_AVX512VL || !TARGET_AVX512BW)
+ return false;
+ gen = gen_avx512vl_vpermi2varv16hi3;
+ break;
+ case V32HImode:
+ if (!TARGET_AVX512BW)
+ return false;
+ gen = gen_avx512bw_vpermi2varv32hi3;
+ break;
+ case V4SImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ gen = gen_avx512vl_vpermi2varv4si3;
+ break;
+ case V8SImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ gen = gen_avx512vl_vpermi2varv8si3;
+ break;
case V16SImode:
- emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
- force_reg (V16SImode, mask),
- op1));
- return true;
+ if (!TARGET_AVX512F)
+ return false;
+ gen = gen_avx512f_vpermi2varv16si3;
+ break;
+ case V4SFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ gen = gen_avx512vl_vpermi2varv4sf3;
+ maskmode = V4SImode;
+ break;
+ case V8SFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ gen = gen_avx512vl_vpermi2varv8sf3;
+ maskmode = V8SImode;
+ break;
case V16SFmode:
- emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
- force_reg (V16SImode, mask),
- op1));
- return true;
+ if (!TARGET_AVX512F)
+ return false;
+ gen = gen_avx512f_vpermi2varv16sf3;
+ maskmode = V16SImode;
+ break;
+ case V2DImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ gen = gen_avx512vl_vpermi2varv2di3;
+ break;
+ case V4DImode:
+ if (!TARGET_AVX512VL)
+ return false;
+ gen = gen_avx512vl_vpermi2varv4di3;
+ break;
case V8DImode:
- emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
- force_reg (V8DImode, mask), op1));
- return true;
+ if (!TARGET_AVX512F)
+ return false;
+ gen = gen_avx512f_vpermi2varv8di3;
+ break;
+ case V2DFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ gen = gen_avx512vl_vpermi2varv2df3;
+ maskmode = V2DImode;
+ break;
+ case V4DFmode:
+ if (!TARGET_AVX512VL)
+ return false;
+ gen = gen_avx512vl_vpermi2varv4df3;
+ maskmode = V4DImode;
+ break;
case V8DFmode:
- emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0,
- force_reg (V8DImode, mask), op1));
- return true;
+ if (!TARGET_AVX512F)
+ return false;
+ gen = gen_avx512f_vpermi2varv8df3;
+ maskmode = V8DImode;
+ break;
default:
return false;
}
+
+ /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
+ expander, so args are either in d, or in op0, op1 etc. */
+ if (d)
+ {
+ rtx vec[64];
+ target = d->target;
+ op0 = d->op0;
+ op1 = d->op1;
+ for (int i = 0; i < d->nelt; ++i)
+ vec[i] = GEN_INT (d->perm[i]);
+ mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
+ }
+
+ emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
+ return true;
}
/* Expand a variable vector permutation. */
@@ -21407,7 +21506,7 @@ ix86_expand_vec_perm (rtx operands[])
e = GET_MODE_UNIT_SIZE (mode);
gcc_assert (w <= 64);
- if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1))
+ if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
return;
if (TARGET_AVX2)
@@ -21779,6 +21878,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
switch (imode)
{
+ case V64QImode:
+ if (unsigned_p)
+ unpack = gen_avx512bw_zero_extendv32qiv32hi2;
+ else
+ unpack = gen_avx512bw_sign_extendv32qiv32hi2;
+ halfmode = V32QImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
+ break;
case V32QImode:
if (unsigned_p)
unpack = gen_avx2_zero_extendv16qiv16hi2;
@@ -39603,20 +39711,6 @@ x86_emit_floatuns (rtx operands[2])
emit_label (donelab);
}
\f
-/* AVX512F does support 64-byte integer vector operations,
- thus the longest vector we are faced with is V64QImode. */
-#define MAX_VECT_LEN 64
-
-struct expand_vec_perm_d
-{
- rtx target, op0, op1;
- unsigned char perm[MAX_VECT_LEN];
- enum machine_mode vmode;
- unsigned char nelt;
- bool one_operand_p;
- bool testing_p;
-};
-
static bool canonicalize_perm (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
@@ -42662,7 +42756,10 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
if (d->one_operand_p)
return false;
- if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+ if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
+ && GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4)
+ ;
+ else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
@@ -42693,12 +42790,18 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
switch (vmode)
{
+ case V8DFmode:
+ case V16SFmode:
case V4DFmode:
case V8SFmode:
case V2DFmode:
case V4SFmode:
case V8HImode:
case V8SImode:
+ case V32HImode:
+ case V64QImode:
+ case V16SImode:
+ case V8DImode:
for (i = 0; i < nelt; ++i)
mask |= (d->perm[i] >= nelt) << i;
break;
@@ -42921,9 +43024,9 @@ static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
unsigned i, nelt, eltsz, mask;
- unsigned char perm[32];
+ unsigned char perm[64];
enum machine_mode vmode = V16QImode;
- rtx rperm[32], vperm, target, op0, op1;
+ rtx rperm[64], vperm, target, op0, op1;
nelt = d->nelt;
@@ -43012,6 +43115,19 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
return false;
}
}
+ else if (GET_MODE_SIZE (d->vmode) == 64)
+ {
+ if (!TARGET_AVX512BW)
+ return false;
+ if (vmode == V64QImode)
+ {
+ /* vpshufb only works intra lanes, it is not
+ possible to shuffle bytes in between the lanes. */
+ for (i = 0; i < nelt; ++i)
+ if ((d->perm[i] ^ i) & (nelt / 4))
+ return false;
+ }
+ }
else
return false;
}
@@ -43029,6 +43145,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
mask = 2 * nelt - 1;
else if (vmode == V16QImode)
mask = nelt - 1;
+ else if (vmode == V64QImode)
+ mask = nelt / 4 - 1;
else
mask = nelt / 2 - 1;
@@ -43054,6 +43172,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
else if (vmode == V32QImode)
emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+ else if (vmode == V64QImode)
+ emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
else if (vmode == V8SFmode)
emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
else
@@ -43109,12 +43229,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
rtx (*gen) (rtx, rtx) = NULL;
switch (d->vmode)
{
+ case V64QImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_vec_dupv64qi;
+ break;
case V32QImode:
gen = gen_avx2_pbroadcastv32qi_1;
break;
+ case V32HImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_vec_dupv32hi;
+ break;
case V16HImode:
gen = gen_avx2_pbroadcastv16hi_1;
break;
+ case V16SImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16si;
+ break;
case V8SImode:
gen = gen_avx2_pbroadcastv8si_1;
break;
@@ -43124,9 +43256,21 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
case V8HImode:
gen = gen_avx2_pbroadcastv8hi;
break;
+ case V16SFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16sf;
+ break;
case V8SFmode:
gen = gen_avx2_vec_dupv8sf_1;
break;
+ case V8DFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8df;
+ break;
+ case V8DImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8di;
+ break;
/* For other modes prefer other shuffles this function creates. */
default: break;
}
@@ -43210,16 +43354,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
return true;
/* Try the AVX512F vpermi2 instructions. */
- rtx vec[64];
- enum machine_mode mode = d->vmode;
- if (mode == V8DFmode)
- mode = V8DImode;
- else if (mode == V16SFmode)
- mode = V16SImode;
- for (i = 0; i < nelt; ++i)
- vec[i] = GEN_INT (d->perm[i]);
- rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
- if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, mask, d->op1))
+ if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
return true;
return false;
@@ -44932,21 +45067,56 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
/* Given sufficient ISA support we can just return true here
for selected vector modes. */
- if (d.vmode == V16SImode || d.vmode == V16SFmode
- || d.vmode == V8DFmode || d.vmode == V8DImode)
- /* All implementable with a single vpermi2 insn. */
- return true;
- if (GET_MODE_SIZE (d.vmode) == 16)
+ switch (d.vmode)
{
+ case V16SFmode:
+ case V16SImode:
+ case V8DImode:
+ case V8DFmode:
+ if (TARGET_AVX512F)
+ /* All implementable with a single vpermi2 insn. */
+ return true;
+ break;
+ case V32HImode:
+ if (TARGET_AVX512BW)
+ /* All implementable with a single vpermi2 insn. */
+ return true;
+ break;
+ case V8SImode:
+ case V8SFmode:
+ case V4DFmode:
+ case V4DImode:
+ if (TARGET_AVX512VL)
+ /* All implementable with a single vpermi2 insn. */
+ return true;
+ break;
+ case V16HImode:
+ if (TARGET_AVX2)
+ /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
+ return true;
+ break;
+ case V32QImode:
+ if (TARGET_AVX2)
+ /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
+ return true;
+ break;
+ case V4SImode:
+ case V4SFmode:
+ case V8HImode:
+ case V16QImode:
/* All implementable with a single vpperm insn. */
if (TARGET_XOP)
return true;
/* All implementable with 2 pshufb + 1 ior. */
if (TARGET_SSSE3)
return true;
+ break;
+ case V2DImode:
+ case V2DFmode:
/* All implementable with shufpd or unpck[lh]pd. */
- if (d.nelt == 2)
- return true;
+ return true;
+ default:
+ return false;
}
/* Extract the values from the vector CST into the permutation
@@ -45066,6 +45236,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
gen_il = gen_avx2_interleave_lowv32qi;
gen_ih = gen_avx2_interleave_highv32qi;
break;
+ case V64QImode:
+ himode = V32HImode;
+ gen_il = gen_avx512bw_interleave_lowv64qi;
+ gen_ih = gen_avx512bw_interleave_highv64qi;
+ break;
default:
gcc_unreachable ();
}
@@ -45126,7 +45301,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
/* For SSE2, we used an full interleave, so the desired
results are in the even elements. */
- for (i = 0; i < 32; ++i)
+ for (i = 0; i < 64; ++i)
d.perm[i] = i * 2;
}
else
@@ -45134,7 +45309,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
/* For AVX, the interleave used above was not cross-lane. So the
extraction is evens but with the second and third quarter swapped.
Happily, that is even one insn shorter than even extraction. */
- for (i = 0; i < 32; ++i)
+ for (i = 0; i < 64; ++i)
d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
}
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index bb6372a..460cbff 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -300,6 +300,9 @@
(define_mode_iterator VI1_AVX2
[(V32QI "TARGET_AVX2") V16QI])
+(define_mode_iterator VI1_AVX512
+ [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI])
+
(define_mode_iterator VI2_AVX2
[(V16HI "TARGET_AVX2") V8HI])
@@ -9239,9 +9242,9 @@
(set_attr "mode" "TI")])
(define_expand "mul<mode>3"
- [(set (match_operand:VI1_AVX2 0 "register_operand")
- (mult:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand")
- (match_operand:VI1_AVX2 2 "register_operand")))]
+ [(set (match_operand:VI1_AVX512 0 "register_operand")
+ (mult:VI1_AVX512 (match_operand:VI1_AVX512 1 "register_operand")
+ (match_operand:VI1_AVX512 2 "register_operand")))]
"TARGET_SSE2"
{
ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
@@ -10621,7 +10624,8 @@
(V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
(V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")
(V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
- (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")])
+ (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
+ (V32HI "TARGET_AVX512BW")])
(define_expand "vec_perm<mode>"
[(match_operand:VEC_PERM_AVX2 0 "register_operand")
@@ -10642,7 +10646,8 @@
(V8SI "TARGET_AVX") (V4DI "TARGET_AVX")
(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
- (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
+ (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
+ (V32HI "TARGET_AVX512BW")])
(define_expand "vec_perm_const<mode>"
[(match_operand:VEC_PERM_CONST 0 "register_operand")
@@ -11006,8 +11011,8 @@
})
(define_insn "<sse2_avx2>_packsswb"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
- (vec_concat:VI1_AVX2
+ [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x")
+ (vec_concat:VI1_AVX512
(ss_truncate:<ssehalfvecmode>
(match_operand:<sseunpackmode> 1 "register_operand" "0,x"))
(ss_truncate:<ssehalfvecmode>
@@ -11040,8 +11045,8 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn "<sse2_avx2>_packuswb"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
- (vec_concat:VI1_AVX2
+ [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x")
+ (vec_concat:VI1_AVX512
(us_truncate:<ssehalfvecmode>
(match_operand:<sseunpackmode> 1 "register_operand" "0,x"))
(us_truncate:<ssehalfvecmode>
@@ -13559,21 +13564,21 @@
(set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
(set_attr "mode" "DI")])
-(define_insn "<ssse3_avx2>_pshufb<mode>3"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
- (unspec:VI1_AVX2
- [(match_operand:VI1_AVX2 1 "register_operand" "0,x")
- (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")]
+(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"
+ [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,v")
+ (unspec:VI1_AVX512
+ [(match_operand:VI1_AVX512 1 "register_operand" "0,v")
+ (match_operand:VI1_AVX512 2 "nonimmediate_operand" "xm,vm")]
UNSPEC_PSHUFB))]
- "TARGET_SSSE3"
+ "TARGET_SSSE3 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
"@
pshufb\t{%2, %0|%0, %2}
- vpshufb\t{%2, %1, %0|%0, %1, %2}"
+ vpshufb\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
[(set_attr "isa" "noavx,avx")
(set_attr "type" "sselog1")
(set_attr "prefix_data16" "1,*")
(set_attr "prefix_extra" "1")
- (set_attr "prefix" "orig,vex")
+ (set_attr "prefix" "orig,maybe_evex")
(set_attr "btver2_decode" "vector,vector")
(set_attr "mode" "<sseinsnmode>")])
@@ -15948,9 +15953,9 @@
(set_attr "mode" "TI")])
(define_expand "<shift_insn><mode>3"
- [(set (match_operand:VI1_AVX2 0 "register_operand")
- (any_shift:VI1_AVX2
- (match_operand:VI1_AVX2 1 "register_operand")
+ [(set (match_operand:VI1_AVX512 0 "register_operand")
+ (any_shift:VI1_AVX512
+ (match_operand:VI1_AVX512 1 "register_operand")
(match_operand:SI 2 "nonmemory_operand")))]
"TARGET_SSE2"
{
--
1.8.3.1
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
2014-10-10 15:49 ` Ilya Tocar
@ 2014-10-10 15:59 ` Jakub Jelinek
2014-10-10 16:39 ` Uros Bizjak
1 sibling, 0 replies; 11+ messages in thread
From: Jakub Jelinek @ 2014-10-10 15:59 UTC (permalink / raw)
To: Ilya Tocar; +Cc: Kirill Yukhin, Uros Bizjak, Richard Henderson, GCC Patches
On Fri, Oct 10, 2014 at 07:47:19PM +0400, Ilya Tocar wrote:
> Updated patch below.
You haven't posted a ChangeLog entry this time, so using the last one:
* config/i386/i386.c
(MAX_VECT_LEN): Move above ix86_expand_vec_perm_vpermi2.
...
* config/i386/sse.md
(define_mode_iterator VI1_AVX512): New.
I'd think you should avoid the line break after filename in these
cases, so
* config/i386/i386.c (MAX_VECT_LEN): Move above
ix86_expand_vec_perm_vpermi2.
...
* config/i386/sse.md (define_mode_iterator VI1_AVX512): New.
Other than that nit it looks good to me.
Jakub
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
2014-10-10 15:49 ` Ilya Tocar
2014-10-10 15:59 ` Jakub Jelinek
@ 2014-10-10 16:39 ` Uros Bizjak
2014-10-16 10:25 ` Ilya Tocar
1 sibling, 1 reply; 11+ messages in thread
From: Uros Bizjak @ 2014-10-10 16:39 UTC (permalink / raw)
To: Ilya Tocar; +Cc: Jakub Jelinek, Kirill Yukhin, Richard Henderson, GCC Patches
On Fri, Oct 10, 2014 at 5:47 PM, Ilya Tocar <tocarip.intel@gmail.com> wrote:
>> My strong preference would be:
>> enum machine_mode maskmode = mode;
>> rtx (*gen) (rtx, rtx, rtx, rtx);
>> right below the enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
>> line and then inside of the first switch just do:
>> ...
>> case V16SImode:
>> if (!TARGET_AVX512F)
>> return false;
>> gen = gen_avx512f_vpermi2varv16si3;
>> break;
>> case V4SFmode:
>> if (!TARGET_AVX512VL)
>> return false;
>> gen = gen_avx512vl_vpermi2varv4sf3;
>> maskmode = V4SImode;
>> break;
>> ...
>> etc., then in the mask = line use:
>> mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
>> and finally instead of the second switch do:
>> emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
>> return true;
>>
> Updated patch below.
Please recode that horrible first switch statement to:
--cut here--
rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
switch (mode)
{
case V8HImode:
if (TARGET_AVX512VL && TARGET_AVX512BW)
gen = gen_avx512vl_vpermi2varv8hi3;
break;
...
case V2DFmode:
if (TARGET_AVX512VL)
{
gen = gen_avx512vl_vpermi2varv2df3;
maskmode = V2DImode;
}
break;
default:
break;
}
if (gen == NULL)
return false;
--cut here--
The patch is OK with the above improvement.
(Please also note that the patch has a bunch of i386.md changes that
will clash with followup patch series).
Thanks,
Uros.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
2014-10-10 16:39 ` Uros Bizjak
@ 2014-10-16 10:25 ` Ilya Tocar
2014-10-16 11:18 ` Jakub Jelinek
2014-10-20 15:20 ` Ilya Tocar
0 siblings, 2 replies; 11+ messages in thread
From: Ilya Tocar @ 2014-10-16 10:25 UTC (permalink / raw)
To: Uros Bizjak; +Cc: Jakub Jelinek, Kirill Yukhin, Richard Henderson, GCC Patches
On 10 Oct 18:37, Uros Bizjak wrote:
> On Fri, Oct 10, 2014 at 5:47 PM, Ilya Tocar <tocarip.intel@gmail.com> wrote:
>
>
> Please recode that horrible first switch statement to:
>
> --cut here--
> rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
>
> switch (mode)
> {
> case V8HImode:
> if (TARGET_AVX512VL && TARGET_AVX512BW)
> gen = gen_avx512vl_vpermi2varv8hi3;
> break;
>
> ...
>
> case V2DFmode:
> if (TARGET_AVX512VL)
> {
> gen = gen_avx512vl_vpermi2varv2df3;
> maskmode = V2DImode;
>
> The patch is OK with the above improvement.
>
> Thanks,
> Uros.
>
Will commit version below, if no objections in 24 hours.
---
gcc/config/i386/i386.c | 292 ++++++++++++++++++++++++++++++++++++++-----------
gcc/config/i386/sse.md | 45 ++++----
2 files changed, 255 insertions(+), 82 deletions(-)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index aedac19..e1228e3 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -21411,35 +21411,132 @@ ix86_expand_int_vcond (rtx operands[])
return true;
}
+/* AVX512F does support 64-byte integer vector operations,
+ thus the longest vector we are faced with is V64QImode. */
+#define MAX_VECT_LEN 64
+
+struct expand_vec_perm_d
+{
+ rtx target, op0, op1;
+ unsigned char perm[MAX_VECT_LEN];
+ enum machine_mode vmode;
+ unsigned char nelt;
+ bool one_operand_p;
+ bool testing_p;
+};
+
static bool
-ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1)
+ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1,
+ struct expand_vec_perm_d *d)
{
- enum machine_mode mode = GET_MODE (op0);
+ /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
+ expander, so args are either in d, or in op0, op1 etc. */
+ enum machine_mode mode = GET_MODE (d ? d->op0 : op0);
+ enum machine_mode maskmode = mode;
+ rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
+
switch (mode)
{
+ case V8HImode:
+ if (TARGET_AVX512VL && TARGET_AVX512BW)
+ gen = gen_avx512vl_vpermi2varv8hi3;
+ break;
+ case V16HImode:
+ if (TARGET_AVX512VL && TARGET_AVX512BW)
+ gen = gen_avx512vl_vpermi2varv16hi3;
+ break;
+ case V32HImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_vpermi2varv32hi3;
+ break;
+ case V4SImode:
+ if (TARGET_AVX512VL)
+ gen = gen_avx512vl_vpermi2varv4si3;
+ break;
+ case V8SImode:
+ if (TARGET_AVX512VL)
+ gen = gen_avx512vl_vpermi2varv8si3;
+ break;
case V16SImode:
- emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0,
- force_reg (V16SImode, mask),
- op1));
- return true;
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vpermi2varv16si3;
+ break;
+ case V4SFmode:
+ if (TARGET_AVX512VL)
+ {
+ gen = gen_avx512vl_vpermi2varv4sf3;
+ maskmode = V4SImode;
+ }
+ break;
+ case V8SFmode:
+ if (TARGET_AVX512VL)
+ {
+ gen = gen_avx512vl_vpermi2varv8sf3;
+ maskmode = V8SImode;
+ }
+ break;
case V16SFmode:
- emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0,
- force_reg (V16SImode, mask),
- op1));
- return true;
+ if (TARGET_AVX512F)
+ {
+ gen = gen_avx512f_vpermi2varv16sf3;
+ maskmode = V16SImode;
+ }
+ break;
+ case V2DImode:
+ if (TARGET_AVX512VL)
+ gen = gen_avx512vl_vpermi2varv2di3;
+ break;
+ case V4DImode:
+ if (TARGET_AVX512VL)
+ gen = gen_avx512vl_vpermi2varv4di3;
+ break;
case V8DImode:
- emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0,
- force_reg (V8DImode, mask),
- op1));
- return true;
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vpermi2varv8di3;
+ break;
+ case V2DFmode:
+ if (TARGET_AVX512VL)
+ {
+ gen = gen_avx512vl_vpermi2varv2df3;
+ maskmode = V2DImode;
+ }
+ break;
+ case V4DFmode:
+ if (TARGET_AVX512VL)
+ {
+ gen = gen_avx512vl_vpermi2varv4df3;
+ maskmode = V4DImode;
+ }
+ break;
case V8DFmode:
- emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0,
- force_reg (V8DImode, mask),
- op1));
- return true;
+ if (TARGET_AVX512F)
+ {
+ gen = gen_avx512f_vpermi2varv8df3;
+ maskmode = V8DImode;
+ }
+ break;
default:
- return false;
+ break;
}
+
+ if (gen == NULL)
+ return false;
+
+ /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const
+ expander, so args are either in d, or in op0, op1 etc. */
+ if (d)
+ {
+ rtx vec[64];
+ target = d->target;
+ op0 = d->op0;
+ op1 = d->op1;
+ for (int i = 0; i < d->nelt; ++i)
+ vec[i] = GEN_INT (d->perm[i]);
+ mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
+ }
+
+ emit_insn (gen (target, op0, force_reg (maskmode, mask), op1));
+ return true;
}
/* Expand a variable vector permutation. */
@@ -21462,8 +21559,7 @@ ix86_expand_vec_perm (rtx operands[])
e = GET_MODE_UNIT_SIZE (mode);
gcc_assert (w <= 64);
- if (TARGET_AVX512F
- && ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1))
+ if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL))
return;
if (TARGET_AVX2)
@@ -21835,6 +21931,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
switch (imode)
{
+ case V64QImode:
+ if (unsigned_p)
+ unpack = gen_avx512bw_zero_extendv32qiv32hi2;
+ else
+ unpack = gen_avx512bw_sign_extendv32qiv32hi2;
+ halfmode = V32QImode;
+ extract
+ = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
+ break;
case V32QImode:
if (unsigned_p)
unpack = gen_avx2_zero_extendv16qiv16hi2;
@@ -39683,20 +39788,6 @@ x86_emit_floatuns (rtx operands[2])
emit_label (donelab);
}
\f
-/* AVX512F does support 64-byte integer vector operations,
- thus the longest vector we are faced with is V64QImode. */
-#define MAX_VECT_LEN 64
-
-struct expand_vec_perm_d
-{
- rtx target, op0, op1;
- unsigned char perm[MAX_VECT_LEN];
- enum machine_mode vmode;
- unsigned char nelt;
- bool one_operand_p;
- bool testing_p;
-};
-
static bool canonicalize_perm (struct expand_vec_perm_d *d);
static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
@@ -42745,7 +42836,10 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
if (d->one_operand_p)
return false;
- if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+ if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
+ && GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4)
+ ;
+ else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
@@ -42776,12 +42870,18 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
switch (vmode)
{
+ case V8DFmode:
+ case V16SFmode:
case V4DFmode:
case V8SFmode:
case V2DFmode:
case V4SFmode:
case V8HImode:
case V8SImode:
+ case V32HImode:
+ case V64QImode:
+ case V16SImode:
+ case V8DImode:
for (i = 0; i < nelt; ++i)
mask |= (d->perm[i] >= nelt) << i;
break;
@@ -43004,9 +43104,9 @@ static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
unsigned i, nelt, eltsz, mask;
- unsigned char perm[32];
+ unsigned char perm[64];
enum machine_mode vmode = V16QImode;
- rtx rperm[32], vperm, target, op0, op1;
+ rtx rperm[64], vperm, target, op0, op1;
nelt = d->nelt;
@@ -43095,6 +43195,19 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
return false;
}
}
+ else if (GET_MODE_SIZE (d->vmode) == 64)
+ {
+ if (!TARGET_AVX512BW)
+ return false;
+ if (vmode == V64QImode)
+ {
+ /* vpshufb only works intra lanes, it is not
+ possible to shuffle bytes in between the lanes. */
+ for (i = 0; i < nelt; ++i)
+ if ((d->perm[i] ^ i) & (nelt / 4))
+ return false;
+ }
+ }
else
return false;
}
@@ -43112,6 +43225,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
mask = 2 * nelt - 1;
else if (vmode == V16QImode)
mask = nelt - 1;
+ else if (vmode == V64QImode)
+ mask = nelt / 4 - 1;
else
mask = nelt / 2 - 1;
@@ -43137,6 +43252,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
else if (vmode == V32QImode)
emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+ else if (vmode == V64QImode)
+ emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
else if (vmode == V8SFmode)
emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
else
@@ -43192,12 +43309,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
rtx (*gen) (rtx, rtx) = NULL;
switch (d->vmode)
{
+ case V64QImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_vec_dupv64qi;
+ break;
case V32QImode:
gen = gen_avx2_pbroadcastv32qi_1;
break;
+ case V32HImode:
+ if (TARGET_AVX512BW)
+ gen = gen_avx512bw_vec_dupv32hi;
+ break;
case V16HImode:
gen = gen_avx2_pbroadcastv16hi_1;
break;
+ case V16SImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16si;
+ break;
case V8SImode:
gen = gen_avx2_pbroadcastv8si_1;
break;
@@ -43207,9 +43336,21 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
case V8HImode:
gen = gen_avx2_pbroadcastv8hi;
break;
+ case V16SFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv16sf;
+ break;
case V8SFmode:
gen = gen_avx2_vec_dupv8sf_1;
break;
+ case V8DFmode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8df;
+ break;
+ case V8DImode:
+ if (TARGET_AVX512F)
+ gen = gen_avx512f_vec_dupv8di;
+ break;
/* For other modes prefer other shuffles this function creates. */
default: break;
}
@@ -43294,23 +43435,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
/* Try the AVX2 vpalignr instruction. */
if (expand_vec_perm_palignr (d, true))
- return true;
/* Try the AVX512F vpermi2 instructions. */
- if (TARGET_AVX512F)
- {
- rtx vec[64];
- enum machine_mode mode = d->vmode;
- if (mode == V8DFmode)
- mode = V8DImode;
- else if (mode == V16SFmode)
- mode = V16SImode;
- for (i = 0; i < nelt; ++i)
- vec[i] = GEN_INT (d->perm[i]);
- rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec));
- if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, mask, d->op1))
- return true;
- }
+ if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
+ return true;
return false;
}
@@ -45097,21 +45225,56 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode,
/* Given sufficient ISA support we can just return true here
for selected vector modes. */
- if (d.vmode == V16SImode || d.vmode == V16SFmode
- || d.vmode == V8DFmode || d.vmode == V8DImode)
- /* All implementable with a single vpermi2 insn. */
- return true;
- if (GET_MODE_SIZE (d.vmode) == 16)
+ switch (d.vmode)
{
+ case V16SFmode:
+ case V16SImode:
+ case V8DImode:
+ case V8DFmode:
+ if (TARGET_AVX512F)
+ /* All implementable with a single vpermi2 insn. */
+ return true;
+ break;
+ case V32HImode:
+ if (TARGET_AVX512BW)
+ /* All implementable with a single vpermi2 insn. */
+ return true;
+ break;
+ case V8SImode:
+ case V8SFmode:
+ case V4DFmode:
+ case V4DImode:
+ if (TARGET_AVX512VL)
+ /* All implementable with a single vpermi2 insn. */
+ return true;
+ break;
+ case V16HImode:
+ if (TARGET_AVX2)
+ /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
+ return true;
+ break;
+ case V32QImode:
+ if (TARGET_AVX2)
+ /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
+ return true;
+ break;
+ case V4SImode:
+ case V4SFmode:
+ case V8HImode:
+ case V16QImode:
/* All implementable with a single vpperm insn. */
if (TARGET_XOP)
return true;
/* All implementable with 2 pshufb + 1 ior. */
if (TARGET_SSSE3)
return true;
+ break;
+ case V2DImode:
+ case V2DFmode:
/* All implementable with shufpd or unpck[lh]pd. */
- if (d.nelt == 2)
- return true;
+ return true;
+ default:
+ return false;
}
/* Extract the values from the vector CST into the permutation
@@ -45231,6 +45394,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
gen_il = gen_avx2_interleave_lowv32qi;
gen_ih = gen_avx2_interleave_highv32qi;
break;
+ case V64QImode:
+ himode = V32HImode;
+ gen_il = gen_avx512bw_interleave_lowv64qi;
+ gen_ih = gen_avx512bw_interleave_highv64qi;
+ break;
default:
gcc_unreachable ();
}
@@ -45291,7 +45459,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
/* For SSE2, we used an full interleave, so the desired
results are in the even elements. */
- for (i = 0; i < 32; ++i)
+ for (i = 0; i < 64; ++i)
d.perm[i] = i * 2;
}
else
@@ -45299,7 +45467,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
/* For AVX, the interleave used above was not cross-lane. So the
extraction is evens but with the second and third quarter swapped.
Happily, that is even one insn shorter than even extraction. */
- for (i = 0; i < 32; ++i)
+ for (i = 0; i < 64; ++i)
d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? -16 : 0);
}
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a6cf363..d78194f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -301,6 +301,9 @@
(define_mode_iterator VI1_AVX2
[(V32QI "TARGET_AVX2") V16QI])
+(define_mode_iterator VI1_AVX512
+ [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI])
+
(define_mode_iterator VI2_AVX2
[(V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX2") V8HI])
@@ -9237,9 +9240,9 @@
(set_attr "mode" "TI")])
(define_expand "mul<mode>3<mask_name>"
- [(set (match_operand:VI1_AVX2 0 "register_operand")
- (mult:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand")
- (match_operand:VI1_AVX2 2 "register_operand")))]
+ [(set (match_operand:VI1_AVX512 0 "register_operand")
+ (mult:VI1_AVX512 (match_operand:VI1_AVX512 1 "register_operand")
+ (match_operand:VI1_AVX512 2 "register_operand")))]
"TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
{
ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]);
@@ -10643,7 +10646,8 @@
(V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2")
(V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")
(V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
- (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")])
+ (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
+ (V32HI "TARGET_AVX512BW")])
(define_expand "vec_perm<mode>"
[(match_operand:VEC_PERM_AVX2 0 "register_operand")
@@ -10664,7 +10668,8 @@
(V8SI "TARGET_AVX") (V4DI "TARGET_AVX")
(V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2")
(V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")
- (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")])
+ (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")
+ (V32HI "TARGET_AVX512BW")])
(define_expand "vec_perm_const<mode>"
[(match_operand:VEC_PERM_CONST 0 "register_operand")
@@ -11028,8 +11033,8 @@
})
(define_insn "<sse2_avx2>_packsswb<mask_name>"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,v")
- (vec_concat:VI1_AVX2
+ [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x")
+ (vec_concat:VI1_AVX512
(ss_truncate:<ssehalfvecmode>
(match_operand:<sseunpackmode> 1 "register_operand" "0,v"))
(ss_truncate:<ssehalfvecmode>
@@ -11062,8 +11067,8 @@
(set_attr "mode" "<sseinsnmode>")])
(define_insn "<sse2_avx2>_packuswb<mask_name>"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,v")
- (vec_concat:VI1_AVX2
+ [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x")
+ (vec_concat:VI1_AVX512
(us_truncate:<ssehalfvecmode>
(match_operand:<sseunpackmode> 1 "register_operand" "0,v"))
(us_truncate:<ssehalfvecmode>
@@ -13641,21 +13646,21 @@
(set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
(set_attr "mode" "DI")])
-(define_insn "<ssse3_avx2>_pshufb<mode>3"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x")
- (unspec:VI1_AVX2
- [(match_operand:VI1_AVX2 1 "register_operand" "0,x")
- (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")]
+(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"
+ [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,v")
+ (unspec:VI1_AVX512
+ [(match_operand:VI1_AVX512 1 "register_operand" "0,v")
+ (match_operand:VI1_AVX512 2 "nonimmediate_operand" "xm,vm")]
UNSPEC_PSHUFB))]
- "TARGET_SSSE3"
+ "TARGET_SSSE3 && <mask_mode512bit_condition> && <mask_avx512bw_condition>"
"@
pshufb\t{%2, %0|%0, %2}
- vpshufb\t{%2, %1, %0|%0, %1, %2}"
+ vpshufb\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}"
[(set_attr "isa" "noavx,avx")
(set_attr "type" "sselog1")
(set_attr "prefix_data16" "1,*")
(set_attr "prefix_extra" "1")
- (set_attr "prefix" "orig,vex")
+ (set_attr "prefix" "orig,maybe_evex")
(set_attr "btver2_decode" "vector,vector")
(set_attr "mode" "<sseinsnmode>")])
@@ -16038,9 +16043,9 @@
(set_attr "mode" "TI")])
(define_expand "<shift_insn><mode>3"
- [(set (match_operand:VI1_AVX2 0 "register_operand")
- (any_shift:VI1_AVX2
- (match_operand:VI1_AVX2 1 "register_operand")
+ [(set (match_operand:VI1_AVX512 0 "register_operand")
+ (any_shift:VI1_AVX512
+ (match_operand:VI1_AVX512 1 "register_operand")
(match_operand:SI 2 "nonmemory_operand")))]
"TARGET_SSE2"
{
--
1.8.3.1
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
2014-10-16 10:25 ` Ilya Tocar
@ 2014-10-16 11:18 ` Jakub Jelinek
2014-10-20 15:20 ` Ilya Tocar
1 sibling, 0 replies; 11+ messages in thread
From: Jakub Jelinek @ 2014-10-16 11:18 UTC (permalink / raw)
To: Ilya Tocar; +Cc: Uros Bizjak, Kirill Yukhin, Richard Henderson, GCC Patches
On Thu, Oct 16, 2014 at 02:23:16PM +0400, Ilya Tocar wrote:
> On 10 Oct 18:37, Uros Bizjak wrote:
> > On Fri, Oct 10, 2014 at 5:47 PM, Ilya Tocar <tocarip.intel@gmail.com> wrote:
> >
> >
> > Please recode that horrible first switch statement to:
> >
> > --cut here--
> > rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
> >
> > switch (mode)
> > {
> > case V8HImode:
> > if (TARGET_AVX512VL && TARGET_AVX152BW)
> > gen = gen_avx512vl_vpermi2varv8hi3;
> > break;
> >
> > ...
> >
> > case V2DFmode:
> > if (TARGET_AVX512VL)
> > {
> > gen = gen_avx512vl_vpermi2varv2df3;
> > maskmode = V2DImode;
> >
> > The patch is OK with the above improvement.
> >
> > Thanks,
> > Uros.
> >
>
> Will commit version below, if no objections in 24 hours.
No need to wait, it is ok now (with proper ChangeLog of course).
Jakub
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
2014-10-16 10:25 ` Ilya Tocar
2014-10-16 11:18 ` Jakub Jelinek
@ 2014-10-20 15:20 ` Ilya Tocar
2014-10-20 16:59 ` Uros Bizjak
1 sibling, 1 reply; 11+ messages in thread
From: Ilya Tocar @ 2014-10-20 15:20 UTC (permalink / raw)
To: Uros Bizjak; +Cc: Jakub Jelinek, Kirill Yukhin, Richard Henderson, GCC Patches
> >
> > The patch is OK with the above improvement.
> >
> >
>
> Will commit version below, if no objections in 24 hours.
>
>
Sorry,
I've missed palignr, which should also have a v64qi version,
and lost a return statement in the expand_vec_perm_palignr case
(this caused avx512f-vec-unpack test failures).
The patch below fixes it. Ok for trunk?
2014-10-20 Ilya Tocar <ilya.tocar@intel.com>
* config/i386/i386.c (expand_vec_perm_1): Fix
expand_vec_perm_palignr case.
* config/i386/sse.md (<ssse3_avx2>_palignr<mode>_mask): Use
VI1_AVX512.
---
gcc/config/i386/i386.c | 1 +
gcc/config/i386/sse.md | 12 ++++++------
2 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 33b21f4..34273ca 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -43552,6 +43552,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
/* Try the AVX2 vpalignr instruction. */
if (expand_vec_perm_palignr (d, true))
+ return true;
/* Try the AVX512F vpermi2 instructions. */
if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 8157045..a3f336f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -13716,14 +13716,14 @@
(set_attr "mode" "DI")])
(define_insn "<ssse3_avx2>_palignr<mode>_mask"
- [(set (match_operand:VI1_AVX2 0 "register_operand" "=v")
- (vec_merge:VI1_AVX2
- (unspec:VI1_AVX2
- [(match_operand:VI1_AVX2 1 "register_operand" "v")
- (match_operand:VI1_AVX2 2 "nonimmediate_operand" "vm")
+ [(set (match_operand:VI1_AVX512 0 "register_operand" "=v")
+ (vec_merge:VI1_AVX512
+ (unspec:VI1_AVX512
+ [(match_operand:VI1_AVX512 1 "register_operand" "v")
+ (match_operand:VI1_AVX512 2 "nonimmediate_operand" "vm")
(match_operand:SI 3 "const_0_to_255_mul_8_operand" "n")]
UNSPEC_PALIGNR)
- (match_operand:VI1_AVX2 4 "vector_move_operand" "0C")
+ (match_operand:VI1_AVX512 4 "vector_move_operand" "0C")
(match_operand:<avx512fmaskmode> 5 "register_operand" "Yk")))]
"TARGET_AVX512BW && (<MODE_SIZE> == 64 || TARGET_AVX512VL)"
{
--
1.8.3.1
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi).
2014-10-20 15:20 ` Ilya Tocar
@ 2014-10-20 16:59 ` Uros Bizjak
0 siblings, 0 replies; 11+ messages in thread
From: Uros Bizjak @ 2014-10-20 16:59 UTC (permalink / raw)
To: Ilya Tocar; +Cc: Jakub Jelinek, Kirill Yukhin, Richard Henderson, GCC Patches
On Mon, Oct 20, 2014 at 5:19 PM, Ilya Tocar <tocarip.intel@gmail.com> wrote:
>> >
>> > The patch is OK with the above improvement.
>> >
>> >
>>
>> Will commit version below, if no objections in 24 hours.
>>
>>
> Sorry,
> I've missed palignr, which should also have v64qi version,
> and lost return in expand_vec_perm_palignr case
> (this caused avx512f-vec-unpack test failures).
> Patch below fixes it. Ok for trunk?
>
> 2014-10-20 Ilya Tocar <ilya.tocar@intel.com>
>
> * config/i386/i386.c (expand_vec_perm_1): Fix
> expand_vec_perm_palignr case.
> * config/i386/sse.md (<ssse3_avx2>_palignr<mode>_mask): Use
> VI1_AVX512.
OK.
Thanks,
Uros.
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2014-10-20 16:08 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-10-06 12:55 [PATCH i386 AVX512] [63/n] Add vpshufb, perm autogen Kirill Yukhin
2014-10-06 14:10 ` Jakub Jelinek
2014-10-09 12:19 ` [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi) Ilya Tocar
2014-10-09 18:51 ` Jakub Jelinek
2014-10-10 15:49 ` Ilya Tocar
2014-10-10 15:59 ` Jakub Jelinek
2014-10-10 16:39 ` Uros Bizjak
2014-10-16 10:25 ` Ilya Tocar
2014-10-16 11:18 ` Jakub Jelinek
2014-10-20 15:20 ` Ilya Tocar
2014-10-20 16:59 ` Uros Bizjak
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).