* [PATCH i386 AVX512] [63/n] Add vpshufb, perm autogen. @ 2014-10-06 12:55 Kirill Yukhin 2014-10-06 14:10 ` Jakub Jelinek 0 siblings, 1 reply; 11+ messages in thread From: Kirill Yukhin @ 2014-10-06 12:55 UTC (permalink / raw) To: Uros Bizjak; +Cc: Jakub Jelinek, Richard Henderson, GCC Patches, kirill.yukhin Hello, This patch extends permutations for AVX-512*. Comments are welcome! Bootstrapped. AVX-512* tests on top of patch-set all pass under simulator. Is it ok for trunk? gcc/ * config/i386/i386.c (ix86_expand_vec_perm_vpermi2): Handle V64QImode, V8HImode, V16HImode, V32HImode, V4SImode, V8SImode, V4SFmode, V8SFmode, V2DImode, V4DImode, V2DFmode, V4DFmode. (ix86_expand_sse_unpack): Handle V64QImode. (expand_vec_perm_blend): Update conditions for TARGET, handle V8DFmode, V16SFmode, V32HImode, V64QImode, V16SImode, V8DImode. (expand_vec_perm_pshufb): Handle V64QImode. (expand_vec_perm_1): Handle V64QImode, V32HImode, V16SImode, V16SFmode, V8DFmode, V8DImode, V4DFmode, V2DFmode, V8SFmode, V4SFmode. (ix86_expand_vec_perm_const_1): Call ix86_expand_vec_perm_vpermi2. (ix86_vectorize_vec_perm_const_ok): Handle V32HImode, V64QImode. (ix86_expand_vecop_qihi): Handle V64QImode. * config/i386/sse.md (define_mode_iterator VI1_AVX2): Add V64QI mode. (define_mode_iterator VEC_PERM_AVX2): Add V32HI mode. (define_mode_iterator VEC_PERM_CONST): Add V64QI and V32HI mode. (define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"): Add masking. -- Thanks, K diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 352ab81..d759a45 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1) enum machine_mode mode = GET_MODE (op0); switch (mode) { + /* There is no byte version of vpermi2. So we use vpermi2w. 
*/ + case V64QImode: + if (!TARGET_AVX512BW) + return false; + rtx mask_lowpart, op0_lowpart, op1_lowpart; + rtx perm_lo, perm_hi, tmp, res_lo, tmp2, res_hi; + + mask_lowpart = gen_lowpart (V32HImode, force_reg (V64QImode, mask)); + op0_lowpart = gen_lowpart (V32HImode, op0); + op1_lowpart = gen_lowpart (V32HImode, op1); + tmp = gen_reg_rtx (V32HImode); + tmp2 = gen_reg_rtx (V32HImode); + perm_lo = gen_reg_rtx (V32HImode); + perm_hi = gen_reg_rtx (V32HImode); + res_lo = gen_reg_rtx (V32HImode); + res_hi = gen_reg_rtx (V32HImode); + + emit_insn (gen_ashlv32hi3 (tmp, mask_lowpart, GEN_INT (8))); + emit_insn (gen_ashrv32hi3 (perm_lo, tmp, GEN_INT (9))); + emit_insn (gen_ashrv32hi3 (perm_hi, mask_lowpart, GEN_INT (9))); + emit_insn (gen_avx512bw_vpermi2varv32hi3 (res_lo, op0_lowpart, + perm_lo, op1_lowpart)); + emit_insn (gen_avx512bw_vpermi2varv32hi3 (tmp2, op0_lowpart, + perm_hi, op1_lowpart)); + emit_insn (gen_ashlv32hi3 (res_hi, tmp2, GEN_INT (8))); + emit_insn (gen_avx512bw_blendmv64qi (target, gen_lowpart (V64QImode, res_lo), + gen_lowpart (V64QImode, res_hi), + force_reg (DImode, GEN_INT (0xAAAAAAAAAAAAAAAALL)))); + return true; + case V8HImode: + if (!TARGET_AVX512VL) + return false; + emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0, + force_reg (V8HImode, mask), op1)); + return true; + case V16HImode: + if (!TARGET_AVX512VL) + return false; + emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0, + force_reg (V16HImode, mask), op1)); + return true; + case V32HImode: + emit_insn (gen_avx512bw_vpermi2varv32hi3 (target, op0, + force_reg (V32HImode, mask), op1)); + return true; + case V4SImode: + if (!TARGET_AVX512VL) + return false; + emit_insn (gen_avx512vl_vpermi2varv4si3 (target, op0, + force_reg (V4SImode, mask), op1)); + return true; + case V8SImode: + if (!TARGET_AVX512VL) + return false; + emit_insn (gen_avx512vl_vpermi2varv8si3 (target, op0, + force_reg (V8SImode, mask), op1)); + return true; case V16SImode: emit_insn (gen_avx512f_vpermi2varv16si3 
(target, op0, force_reg (V16SImode, mask), op1)); return true; + case V4SFmode: + if (!TARGET_AVX512VL) + return false; + emit_insn (gen_avx512vl_vpermi2varv4sf3 (target, op0, + force_reg (V4SImode, mask), op1)); + return true; + case V8SFmode: + if (!TARGET_AVX512VL) + return false; + emit_insn (gen_avx512vl_vpermi2varv8sf3 (target, op0, + force_reg (V8SImode, mask), op1)); + return true; case V16SFmode: emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0, force_reg (V16SImode, mask), op1)); return true; + case V2DImode: + if (!TARGET_AVX512VL) + return false; + emit_insn (gen_avx512vl_vpermi2varv2di3 (target, op0, + force_reg (V2DImode, mask), op1)); + return true; + case V4DImode: + if (!TARGET_AVX512VL) + return false; + emit_insn (gen_avx512vl_vpermi2varv4di3 (target, op0, + force_reg (V4DImode, mask), op1)); + return true; case V8DImode: emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0, force_reg (V8DImode, mask), op1)); return true; + case V2DFmode: + if (!TARGET_AVX512VL) + return false; + emit_insn (gen_avx512vl_vpermi2varv2df3 (target, op0, + force_reg (V2DImode, mask), op1)); + return true; + case V4DFmode: + if (!TARGET_AVX512VL) + return false; + emit_insn (gen_avx512vl_vpermi2varv4df3 (target, op0, + force_reg (V4DImode, mask), op1)); + return true; case V8DFmode: emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0, force_reg (V8DImode, mask), op1)); @@ -21779,6 +21872,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) switch (imode) { + case V64QImode: + if (unsigned_p) + unpack = gen_avx512bw_zero_extendv32qiv32hi2; + else + unpack = gen_avx512bw_sign_extendv32qiv32hi2; + halfmode = V32QImode; + extract + = high_p ? 
gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi; + break; case V32QImode: if (unsigned_p) unpack = gen_avx2_zero_extendv16qiv16hi2; @@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) if (d->one_operand_p) return false; - if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) + if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 && + GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4) + ; + else if (TARGET_AVX512VL) + ; + else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) ; else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) ; @@ -42693,12 +42800,18 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) switch (vmode) { + case V8DFmode: + case V16SFmode: case V4DFmode: case V8SFmode: case V2DFmode: case V4SFmode: case V8HImode: case V8SImode: + case V32HImode: + case V64QImode: + case V16SImode: + case V8DImode: for (i = 0; i < nelt; ++i) mask |= (d->perm[i] >= nelt) << i; break; @@ -42921,9 +43034,9 @@ static bool expand_vec_perm_pshufb (struct expand_vec_perm_d *d) { unsigned i, nelt, eltsz, mask; - unsigned char perm[32]; + unsigned char perm[64]; enum machine_mode vmode = V16QImode; - rtx rperm[32], vperm, target, op0, op1; + rtx rperm[64], vperm, target, op0, op1; nelt = d->nelt; @@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) return false; } } + else if (GET_MODE_SIZE (d->vmode) == 64) + { + if (!TARGET_AVX512BW) + return false; + if (vmode == V64QImode) + { + for (i = 0; i < nelt; ++i) + if ((d->perm[i] ^ i) & (nelt / 4)) + return false; + } + } else return false; } @@ -43029,6 +43153,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) mask = 2 * nelt - 1; else if (vmode == V16QImode) mask = nelt - 1; + else if (vmode == V64QImode) + mask = nelt / 4 - 1; else mask = nelt / 2 - 1; @@ -43054,6 +43180,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); else if (vmode == V32QImode) emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); 
+ else if (vmode == V64QImode) + emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm)); else if (vmode == V8SFmode) emit_insn (gen_avx2_permvarv8sf (target, op0, vperm)); else @@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) rtx (*gen) (rtx, rtx) = NULL; switch (d->vmode) { + case V64QImode: + if (TARGET_AVX512VL) + gen = gen_avx512bw_vec_dupv64qi; + break; case V32QImode: gen = gen_avx2_pbroadcastv32qi_1; break; + case V32HImode: + if (TARGET_AVX512VL) + gen = gen_avx512bw_vec_dupv32hi; + break; case V16HImode: gen = gen_avx2_pbroadcastv16hi_1; break; + case V16SImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv16si; + break; case V8SImode: gen = gen_avx2_pbroadcastv8si_1; break; @@ -43124,9 +43264,21 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) case V8HImode: gen = gen_avx2_pbroadcastv8hi; break; + case V16SFmode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv16sf; + break; case V8SFmode: gen = gen_avx2_vec_dupv8sf_1; break; + case V8DFmode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv8df; + break; + case V8DImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv8di; + break; /* For other modes prefer other shuffles this function creates. */ default: break; } @@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) mode = V8DImode; else if (mode == V16SFmode) mode = V16SImode; + else if (mode == V4DFmode) + mode = V4DImode; + else if (mode == V2DFmode) + mode = V2DImode; + else if (mode == V8SFmode) + mode = V8SImode; + else if (mode == V4SFmode) + mode = V4SImode; for (i = 0; i < nelt; ++i) vec[i] = GEN_INT (d->perm[i]); rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec)); @@ -44759,6 +44919,16 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) return true; /* Try sequences of two instructions. */ + /* ix86_expand_vec_perm_vpermi2 is also called from + * ix86_expand_vec_perm. So it doesn't take d as parameter. + * Construct needed params. 
*/ + rtx vec[64]; + int i; + for (i = 0; i < d->nelt; ++i) + vec[i] = GEN_INT (d->perm[i]); + rtx sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, vec)); + if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, sel, d->op1)) + return true; if (expand_vec_perm_pshuflw_pshufhw (d)) return true; @@ -44933,7 +45103,8 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode, /* Given sufficient ISA support we can just return true here for selected vector modes. */ if (d.vmode == V16SImode || d.vmode == V16SFmode - || d.vmode == V8DFmode || d.vmode == V8DImode) + || d.vmode == V8DFmode || d.vmode == V8DImode + || d.vmode == V32HImode || d.vmode == V64QImode) /* All implementable with a single vpermi2 insn. */ return true; if (GET_MODE_SIZE (d.vmode) == 16) @@ -45066,6 +45237,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) gen_il = gen_avx2_interleave_lowv32qi; gen_ih = gen_avx2_interleave_highv32qi; break; + case V64QImode: + himode = V32HImode; + gen_il = gen_avx512bw_interleave_lowv64qi; + gen_ih = gen_avx512bw_interleave_highv64qi; + break; default: gcc_unreachable (); } @@ -45126,7 +45302,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) { /* For SSE2, we used an full interleave, so the desired results are in the even elements. */ - for (i = 0; i < 32; ++i) + for (i = 0; i < 64; ++i) d.perm[i] = i * 2; } else @@ -45134,7 +45310,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) /* For AVX, the interleave used above was not cross-lane. So the extraction is evens but with the second and third quarter swapped. Happily, that is even one insn shorter than even extraction. */ - for (i = 0; i < 32; ++i) + for (i = 0; i < 64; ++i) d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? 
-16 : 0); } diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index bb6372a..d3e9635 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -298,7 +298,7 @@ [V8DI (V4DI "TARGET_AVX512VL")]) (define_mode_iterator VI1_AVX2 - [(V32QI "TARGET_AVX2") V16QI]) + [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI]) (define_mode_iterator VI2_AVX2 [(V16HI "TARGET_AVX2") V8HI]) @@ -10621,7 +10621,8 @@ (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2") (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F") - (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")]) + (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") + (V32HI "TARGET_AVX512BW") (V64QI "TARGET_AVX512BW")]) (define_expand "vec_perm<mode>" [(match_operand:VEC_PERM_AVX2 0 "register_operand") @@ -10642,7 +10643,8 @@ (V8SI "TARGET_AVX") (V4DI "TARGET_AVX") (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2") (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") - (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")]) + (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F") + (V32HI "TARGET_AVX512BW")]) (define_expand "vec_perm_const<mode>" [(match_operand:VEC_PERM_CONST 0 "register_operand") @@ -13559,21 +13561,21 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) -(define_insn "<ssse3_avx2>_pshufb<mode>3" - [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x") +(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>" + [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,v") (unspec:VI1_AVX2 - [(match_operand:VI1_AVX2 1 "register_operand" "0,x") - (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")] + [(match_operand:VI1_AVX2 1 "register_operand" "0,v") + (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,vm")] UNSPEC_PSHUFB))] - "TARGET_SSSE3" + "TARGET_SSSE3 && <mask_mode512bit_condition> && <mask_avx512bw_condition>" "@ pshufb\t{%2, %0|%0, %2} - vpshufb\t{%2, %1, %0|%0, %1, %2}" + vpshufb\t{%2, %1, 
%0<mask_operand3>|%0<mask_operand3>, %1, %2}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sselog1") (set_attr "prefix_data16" "1,*") (set_attr "prefix_extra" "1") - (set_attr "prefix" "orig,vex") + (set_attr "prefix" "orig,maybe_evex") (set_attr "btver2_decode" "vector,vector") (set_attr "mode" "<sseinsnmode>")]) ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63/n] Add vpshufb, perm autogen. 2014-10-06 12:55 [PATCH i386 AVX512] [63/n] Add vpshufb, perm autogen Kirill Yukhin @ 2014-10-06 14:10 ` Jakub Jelinek 2014-10-09 12:19 ` [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi) Ilya Tocar 0 siblings, 1 reply; 11+ messages in thread From: Jakub Jelinek @ 2014-10-06 14:10 UTC (permalink / raw) To: Kirill Yukhin; +Cc: Uros Bizjak, Richard Henderson, GCC Patches On Mon, Oct 06, 2014 at 04:55:28PM +0400, Kirill Yukhin wrote: > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1) > enum machine_mode mode = GET_MODE (op0); > switch (mode) > { > + /* There is no byte version of vpermi2. So we use vpermi2w. */ > + case V64QImode: > + if (!TARGET_AVX512BW) > + return false; > + rtx mask_lowpart, op0_lowpart, op1_lowpart; > + rtx perm_lo, perm_hi, tmp, res_lo, tmp2, res_hi; > + > + mask_lowpart = gen_lowpart (V32HImode, force_reg (V64QImode, mask)); > + op0_lowpart = gen_lowpart (V32HImode, op0); > + op1_lowpart = gen_lowpart (V32HImode, op1); > + tmp = gen_reg_rtx (V32HImode); > + tmp2 = gen_reg_rtx (V32HImode); > + perm_lo = gen_reg_rtx (V32HImode); > + perm_hi = gen_reg_rtx (V32HImode); > + res_lo = gen_reg_rtx (V32HImode); > + res_hi = gen_reg_rtx (V32HImode); > + > + emit_insn (gen_ashlv32hi3 (tmp, mask_lowpart, GEN_INT (8))); > + emit_insn (gen_ashrv32hi3 (perm_lo, tmp, GEN_INT (9))); > + emit_insn (gen_ashrv32hi3 (perm_hi, mask_lowpart, GEN_INT (9))); > + emit_insn (gen_avx512bw_vpermi2varv32hi3 (res_lo, op0_lowpart, > + perm_lo, op1_lowpart)); > + emit_insn (gen_avx512bw_vpermi2varv32hi3 (tmp2, op0_lowpart, > + perm_hi, op1_lowpart)); > + emit_insn (gen_ashlv32hi3 (res_hi, tmp2, GEN_INT (8))); > + emit_insn (gen_avx512bw_blendmv64qi (target, gen_lowpart (V64QImode, res_lo), > + gen_lowpart (V64QImode, res_hi), > + force_reg (DImode, GEN_INT (0xAAAAAAAAAAAAAAAALL)))); > + 
return true; I believe this case doesn't belong to this function, other than this case ix86_expand_vec_perm_vpermi2 emits always just a single insn, and so it should always do that, and there should be a separate function that expands the worst case of V64QImode full 2 operand permutation. See my previous mail, IMHO it is doable with 5 instructions rather than 7. And IMHO we should have a separate function which emits that, supposedly one for the constant permutations, one for the variable case (perhaps then your 7 insn sequence is best?). Also, IMHO rather than building a CONST_VECTOR ahead in each of the callers, supposedly ix86_expand_vec_perm_vpermi2 could take the arguments it takes right now plus D, either D would be NULL (then it would behave as now), or SEL would be NULL, then it would create a CONST_VECTOR on the fly if needed. I.e. the function would start with a switch that would just contain the if (...) return false; hunks plus break; for the success case, then code to generate CONST_VECTOR if sel is NULL_RTX from d, and finally another switch with just the emit cases. Or, the first switch could just set a function pointer before break, and just use one common emit_insn (gen (target, op0, force_reg (vmode, mask), op1)); > + case V8HImode: > + if (!TARGET_AVX512VL) > + return false; > + emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0, > + force_reg (V8HImode, mask), op1)); > + return true; > + case V16HImode: > + if (!TARGET_AVX512VL) > + return false; > + emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0, > + force_reg (V16HImode, mask), op1)); > + return true; Aren't these two insns there only if both TARGET_AVX512VL && TARGET_AVX512BW? I mean, the ISA pdf mentions both of the CPUID flags simultaneously, and I think neither of these depends on the other one in GCC. That's unlike insns where CPUID AVX512VL and AVX512F are mentioned together, because in GCC AVX512VL depends on AVX512F. 
> @@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) > > if (d->one_operand_p) > return false; > - if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) > + if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 && > + GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4) Formatting, && belongs on the second line. > + ; > + else if (TARGET_AVX512VL) I'd add && GET_MODE_SIZE (GET_MODE_INNER (vmode) == 64 here. AVX512VL is not going to handle 64-bit vectors, or 1024-bit ones, and the == 32 and == 16 cases are handled because AVX512VL implies TARGET_AVX2 and TARGET_SSE4_1, doesn't it? > @@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) > return false; > } > } > + else if (GET_MODE_SIZE (d->vmode) == 64) > + { > + if (!TARGET_AVX512BW) > + return false; > + if (vmode == V64QImode) > + { > + for (i = 0; i < nelt; ++i) > + if ((d->perm[i] ^ i) & (nelt / 4)) > + return false; Missing comment, I'd duplicate the /* vpshufb only works intra lanes, it is not possible to shuffle bytes in between the lanes. */ comment there. > @@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) > rtx (*gen) (rtx, rtx) = NULL; > switch (d->vmode) > { > + case V64QImode: > + if (TARGET_AVX512VL) VL? Isn't that BW? > + gen = gen_avx512bw_vec_dupv64qi; > + break; > case V32QImode: > gen = gen_avx2_pbroadcastv32qi_1; > break; > + case V32HImode: > + if (TARGET_AVX512VL) Ditto. > @@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) > mode = V8DImode; > else if (mode == V16SFmode) > mode = V16SImode; > + else if (mode == V4DFmode) > + mode = V4DImode; > + else if (mode == V2DFmode) > + mode = V2DImode; > + else if (mode == V8SFmode) > + mode = V8SImode; > + else if (mode == V4SFmode) > + mode = V4SImode; > for (i = 0; i < nelt; ++i) > vec[i] = GEN_INT (d->perm[i]); > rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec)); See above comment about CONST_VECTOR. 
> @@ -44759,6 +44919,16 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) > return true; > > /* Try sequences of two instructions. */ > + /* ix86_expand_vec_perm_vpermi2 is also called from > + * ix86_expand_vec_perm. So it doesn't take d as parameter. > + * Construct needed params. */ > + rtx vec[64]; > + int i; > + for (i = 0; i < d->nelt; ++i) > + vec[i] = GEN_INT (d->perm[i]); > + rtx sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, vec)); > + if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, sel, d->op1)) > + return true; > > if (expand_vec_perm_pshuflw_pshufhw (d)) > return true; I don't understand this. Doesn't ix86_expand_vec_perm_vpermi2 generate (except for the V64QI case discussed above) a single insn? Then expand_vec_perm_1 should have handled that already, so this is just a waste of resources here. > @@ -44933,7 +45103,8 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode, > /* Given sufficient ISA support we can just return true here > for selected vector modes. */ > if (d.vmode == V16SImode || d.vmode == V16SFmode > - || d.vmode == V8DFmode || d.vmode == V8DImode) > + || d.vmode == V8DFmode || d.vmode == V8DImode > + || d.vmode == V32HImode || d.vmode == V64QImode) > /* All implementable with a single vpermi2 insn. */ > return true; 1) Shouldn't this be guarded with TARGET_AVX512F && and in the V32HImode/V64QImode also with TARGET_AVX512BW? The comment is not correct for V64QImode. 2) For TARGET_AVX512VL, vpermi2 can handle also smaller mode sizes. Perhaps it would be best to turn this into switch (d.vmode) { case V16SImode: case V16SFmode: case V8DFmode: case V8DImode: if (TARGET_AVX512F) /* All implementable with a single vpermi2 insn. */ return true; break; case V32HImode: if (TARGET_AVX512BW) /* Implementable with a single vpermi2 insn. */ return true; break; case V64QImode: if (TARGET_AVX512BW) /* Implementable with 2 vpermi2w, 2 vpshufb and one vpor insns. 
*/ return true; break; case V8SImode: case V8SFmode: case V4DFmode: case V4DImode: if (TARGET_AVX512VL) /* Implementable with a single vpermi2 insn. */ return true; break; case V16HImode: if (TARGET_AVX512VL && TARGET_AVX512BW) /* Implementable with a single vpermi2 insn. */ return true; if (TARGET_AVX2) /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ return true; break; case V32QImode: if (TARGET_AVX2) /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ return true; break; case V4SImode: case V4SFmode: case V8HImode: case V16QImode: /* All implementable with a single vpperm insn. */ if (TARGET_XOP) return true; /* All implementable with 2 pshufb + 1 ior. */ if (TARGET_SSSE3) return true; break; case V2DImode: case V2DFmode: /* All implementable with shufpd or unpck[lh]pd. */ return true; } Now, for V8SI/V8SF/V4DI/V4DF, I wonder if we have (for either AVX or AVX2) any expanders that guarantee we generate some sequence for all possible 2 operand constant permutations. I think ix86_expand_vec_perm is able to emit the non-constant permutations for all of these, so in theory we should have an upper bound for all these. Jakub ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi). 2014-10-06 14:10 ` Jakub Jelinek @ 2014-10-09 12:19 ` Ilya Tocar 2014-10-09 18:51 ` Jakub Jelinek 0 siblings, 1 reply; 11+ messages in thread From: Ilya Tocar @ 2014-10-09 12:19 UTC (permalink / raw) To: Jakub Jelinek; +Cc: Kirill Yukhin, Uros Bizjak, Richard Henderson, GCC Patches Hi, I think this patch should be split in 2 parts: V64QI related and non-V64QI related. This part contains non-V64QI related changes. Also I've noticed, that not all patterns using VI1_AVX2, actually have AVX512 versions, so fixed bogus patterns. On 06 Oct 16:10, Jakub Jelinek wrote: > On Mon, Oct 06, 2014 at 04:55:28PM +0400, Kirill Yukhin wrote: > > --- a/gcc/config/i386/i386.c > > +++ b/gcc/config/i386/i386.c > > @@ -21364,20 +21364,113 @@ ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1) > > enum machine_mode mode = GET_MODE (op0); > > switch (mode) > > { > > + /* There is no byte version of vpermi2. So we use vpermi2w. */ > > + case V64QImode: ... > > I believe this case doesn't belong to this function, other than this > case ix86_expand_vec_perm_vpermi2 emits always just a single insn, and > so it should always do that, and there should be a separate function > that expands the worst case of V64QImode full 2 operand permutation. > See my previous mail, IMHO it is doable with 5 instructions rather than 7. > And IMHO we should have a separate function which emits that, supposedly > one for the constant permutations, one for the variable case (perhaps > then your 7 insn sequence is best?). This will be done in following patch. > > Also, IMHO rather than building a CONST_VECTOR ahead in each of the callers, > supposedly ix86_expand_vec_perm_vpermi2 could take the arguments it takes > right now plus D, either D would be NULL (then it would behave as now), or > SEL would be NULL, then it would create a CONST_VECTOR on the fly if needed. > I.e. 
the function would start with a switch that would just contain the > if (...) > return false; > hunks plus break; for the success case, then code to generate CONST_VECTOR > if sel is NULL_RTX from d, and finally another switch with just the emit > cases. Done. > > > + case V8HImode: > > + if (!TARGET_AVX512VL) > > + return false; > > + emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0, > > + force_reg (V8HImode, mask), op1)); > > + return true; > > + case V16HImode: > > + if (!TARGET_AVX512VL) > > + return false; > > + emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0, > > + force_reg (V16HImode, mask), op1)); > > + return true; > > Aren't these two insns there only if both TARGET_AVX512VL && TARGET_AVX512BW? > I mean, the ISA pdf mentions both of the CPUID flags simultaneously, and I > think neither of these depends on the other one in GCC. That's unlike insns > where CPUID AVX512VL and AVX512F are mentioned together, because in GCC > AVX512VL depends on AVX512F. > Good catch! > > @@ -42662,7 +42764,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) > > > > if (d->one_operand_p) > > return false; > > - if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) > > + if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 && > > + GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4) > > Formatting, && belongs on the second line. > Fixed. > > + ; > > + else if (TARGET_AVX512VL) > > I'd add && GET_MODE_SIZE (GET_MODE_INNER (vmode) == 64 here. > AVX512VL is not going to handle 64-bit vectors, or 1024-bit ones, > and the == 32 and == 16 cases are handled because AVX512VL implies > TARGET_AVX2 and TARGET_SSE4_1, doesn't it? > As TARGET_AVX512VL always implies TARGET_AVX2 and TARGET_SSE4_1 and works only on 32/16-byte mode this case is redundant, so I've removed it. 
> > @@ -43012,6 +43125,17 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) > > return false; > > } > > } > > + else if (GET_MODE_SIZE (d->vmode) == 64) > > + { > > + if (!TARGET_AVX512BW) > > + return false; > > + if (vmode == V64QImode) > > + { > > + for (i = 0; i < nelt; ++i) > > + if ((d->perm[i] ^ i) & (nelt / 4)) > > + return false; > > Missing comment, I'd duplicate the > /* vpshufb only works intra lanes, it is not > possible to shuffle bytes in between the lanes. */ > comment there. > Done. > > @@ -43109,12 +43237,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) > > rtx (*gen) (rtx, rtx) = NULL; > > switch (d->vmode) > > { > > + case V64QImode: > > + if (TARGET_AVX512VL) > > VL? Isn't that BW? > > > + gen = gen_avx512bw_vec_dupv64qi; > > + break; > > case V32QImode: > > gen = gen_avx2_pbroadcastv32qi_1; > > break; > > + case V32HImode: > > + if (TARGET_AVX512VL) > > Ditto. > Fixed. > > @@ -43216,6 +43368,14 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) > > mode = V8DImode; > > else if (mode == V16SFmode) > > mode = V16SImode; > > + else if (mode == V4DFmode) > > + mode = V4DImode; > > + else if (mode == V2DFmode) > > + mode = V2DImode; > > + else if (mode == V8SFmode) > > + mode = V8SImode; > > + else if (mode == V4SFmode) > > + mode = V4SImode; > > for (i = 0; i < nelt; ++i) > > vec[i] = GEN_INT (d->perm[i]); > > rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec)); > > See above comment about CONST_VECTOR. > Done. > > @@ -44759,6 +44919,16 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) > > return true; > > > > /* Try sequences of two instructions. */ > > + /* ix86_expand_vec_perm_vpermi2 is also called from > > + * ix86_expand_vec_perm. So it doesn't take d as parameter. > > + * Construct needed params. 
*/ > > + rtx vec[64]; > > + int i; > > + for (i = 0; i < d->nelt; ++i) > > + vec[i] = GEN_INT (d->perm[i]); > > + rtx sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, vec)); > > + if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, sel, d->op1)) > > + return true; > > > > if (expand_vec_perm_pshuflw_pshufhw (d)) > > return true; > > I don't understand this. Doesn't ix86_expand_vec_perm_vpermi2 generate > (except for the V64QI case discussed above) a single insn? Then > expand_vec_perm_1 should have handled that already, so this is just a waste > of resources here. > Removed. > > @@ -44933,7 +45103,8 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode, > > /* Given sufficient ISA support we can just return true here > > for selected vector modes. */ > > if (d.vmode == V16SImode || d.vmode == V16SFmode > > - || d.vmode == V8DFmode || d.vmode == V8DImode) > > + || d.vmode == V8DFmode || d.vmode == V8DImode > > + || d.vmode == V32HImode || d.vmode == V64QImode) > > /* All implementable with a single vpermi2 insn. */ > > return true; > > 1) Shouldn't this be guarded with TARGET_AVX512F && > and in the V32HImode/V64QImode also with TARGET_AVX512BW? > The comment is not correct for V64QImode. > This are probably no 512-bit modes without AVX512F, but I've refactored it as per your suggestion below. > 2) For TARGET_AVX512VL, vpermi2 can handle also smaller mode sizes. > Perhaps it would be best to turn this into > switch (d.vmode) > { > case V16SImode: > case V16SFmode: > case V8DFmode: > case V8DImode: > if (TARGET_AVX512F) > /* All implementable with a single vpermi2 insn. */ > break; ... > > Now, for V8SI/V8SF/V4DI/V4DF, I wonder if we have (for either AVX or AVX2) > any expanders that guarantee we generate some sequence for all possible > 2 operand constant permutations. I think ix86_expand_vec_perm is able > to emit the non-constant permutations for all of these, so in theory > we should have an upper bound for all these. 
> I'm not sure about it, so for now I've left V8SI/V8SF/V4DI/V4DF out. Updated patch below: gcc/ * config/i386/i386.c (MAX_VECT_LEN): Move above ix86_expand_vec_perm_vpermi2. (struct expand_vec_perm_d): Ditto. (ix86_expand_vec_perm_vpermi2): Handle V8HImode, V16HImode, V2DFmode, V32HImode, V4SImode, V8SImode, V4SFmode, V8SFmode, V2DImode, V4DImode, V4DFmode. (ix86_expand_vec_perm): Update ix86_expand_vec_perm_vpermi2 signature. (ix86_expand_sse_unpack): Handle V64QImode. (expand_vec_perm_blend): Update conditions for TARGET, handle V8DFmode, V16SFmode, V32HImode, V64QImode, V16SImode, V8DImode. (expand_vec_perm_pshufb): Handle V64QImode. (expand_vec_perm_1): Handle V64QImode, V32HImode, V16SImode, V16SFmode, V8DFmode, V8DImode, V4DFmode, V2DFmode, V8SFmode, V4SFmode. (ix86_expand_vec_perm_const_1): Call ix86_expand_vec_perm_vpermi2. (ix86_vectorize_vec_perm_const_ok): Handle V32HImode, V64QImode. (ix86_expand_vecop_qihi): Handle V64QImode. * config/i386/sse.md (define_mode_iterator VI1_AVX512): New. (define_mode_iterator VEC_PERM_AVX2): Add V32HI mode. (define_mode_iterator VEC_PERM_CONST): Add V32HI mode. (define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>"): Add masking. (mul<mode>3): Use VI1_AVX512. (<sse2_avx2>_packsswb): Ditto. (<sse2_avx2>_packuswb): Ditto. (<ssse3_avx2>_pshufb<mode>3): Ditto. (<shift_insn><mode>3): Ditto. --- gcc/config/i386/i386.c | 293 ++++++++++++++++++++++++++++++++++++++++++------- gcc/config/i386/sse.md | 45 ++++---- 2 files changed, 278 insertions(+), 60 deletions(-) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 352ab81..426ea9e 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -21358,32 +21358,169 @@ ix86_expand_int_vcond (rtx operands[]) return true; } +/* AVX512F does support 64-byte integer vector operations, + thus the longest vector we are faced with is V64QImode. 
*/ +#define MAX_VECT_LEN 64 + +struct expand_vec_perm_d +{ + rtx target, op0, op1; + unsigned char perm[MAX_VECT_LEN]; + enum machine_mode vmode; + unsigned char nelt; + bool one_operand_p; + bool testing_p; +}; + static bool -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1) +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, struct expand_vec_perm_d *d) { - enum machine_mode mode = GET_MODE (op0); + enum machine_mode mode = GET_MODE (d ? d->op0 : op0); + switch (mode) { + case V8HImode: + if (!TARGET_AVX512VL || !TARGET_AVX512BW) + return false; + break; + case V16HImode: + if (!TARGET_AVX512VL || !TARGET_AVX512BW) + return false; + case V32HImode: + if (!TARGET_AVX512BW) + return false; + break; + case V4SImode: + if (!TARGET_AVX512VL) + return false; + break; + case V8SImode: + if (!TARGET_AVX512VL) + return false; + break; + case V16SImode: + if (!TARGET_AVX512F) + return false; + break; + case V4SFmode: + if (!TARGET_AVX512VL) + return false; + break; + case V8SFmode: + if (!TARGET_AVX512VL) + return false; + break; + case V16SFmode: + if (!TARGET_AVX512F) + return false; + break; + case V2DImode: + if (!TARGET_AVX512VL) + return false; + break; + case V4DImode: + if (!TARGET_AVX512VL) + return false; + break; + case V8DImode: + if (!TARGET_AVX512F) + return false; + break; + case V2DFmode: + if (!TARGET_AVX512VL) + return false; + break; + case V4DFmode: + if (!TARGET_AVX512VL) + return false; + break; + case V8DFmode: + if (!TARGET_AVX512F) + return false; + break; + default: + return false; + } + + /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const expander, + so args are either in d, or in op0, op1 etc. 
*/ + if (d) + { + rtx vec[64]; + target = d->target; + op0 = d->op0; + op1 = d->op1; + for (int i = 0; i < d->nelt; ++i) + vec[i] = GEN_INT (d->perm[i]); + mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (d->nelt, vec)); + } + + switch (mode) + { + case V8HImode: + emit_insn (gen_avx512vl_vpermi2varv8hi3 (target, op0, + force_reg (V8HImode, mask), op1)); + return true; + case V16HImode: + emit_insn (gen_avx512vl_vpermi2varv16hi3 (target, op0, + force_reg (V16HImode, mask), op1)); + return true; + case V32HImode: + emit_insn (gen_avx512bw_vpermi2varv32hi3 (target, op0, + force_reg (V32HImode, mask), op1)); + return true; + case V4SImode: + emit_insn (gen_avx512vl_vpermi2varv4si3 (target, op0, + force_reg (V4SImode, mask), op1)); + return true; + case V8SImode: + emit_insn (gen_avx512vl_vpermi2varv8si3 (target, op0, + force_reg (V8SImode, mask), op1)); + return true; case V16SImode: emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0, force_reg (V16SImode, mask), op1)); return true; + case V4SFmode: + emit_insn (gen_avx512vl_vpermi2varv4sf3 (target, op0, + force_reg (V4SImode, mask), op1)); + return true; + case V8SFmode: + emit_insn (gen_avx512vl_vpermi2varv8sf3 (target, op0, + force_reg (V8SImode, mask), op1)); + return true; case V16SFmode: emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0, force_reg (V16SImode, mask), op1)); return true; + case V2DImode: + emit_insn (gen_avx512vl_vpermi2varv2di3 (target, op0, + force_reg (V2DImode, mask), op1)); + return true; + case V4DImode: + emit_insn (gen_avx512vl_vpermi2varv4di3 (target, op0, + force_reg (V4DImode, mask), op1)); + return true; case V8DImode: emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0, force_reg (V8DImode, mask), op1)); return true; + case V2DFmode: + emit_insn (gen_avx512vl_vpermi2varv2df3 (target, op0, + force_reg (V2DImode, mask), op1)); + return true; + case V4DFmode: + emit_insn (gen_avx512vl_vpermi2varv4df3 (target, op0, + force_reg (V4DImode, mask), op1)); + return true; case V8DFmode: 
emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0, force_reg (V8DImode, mask), op1)); return true; default: - return false; + gcc_unreachable (); } } @@ -21407,7 +21544,7 @@ ix86_expand_vec_perm (rtx operands[]) e = GET_MODE_UNIT_SIZE (mode); gcc_assert (w <= 64); - if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1)) + if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL)) return; if (TARGET_AVX2) @@ -21779,6 +21916,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) switch (imode) { + case V64QImode: + if (unsigned_p) + unpack = gen_avx512bw_zero_extendv32qiv32hi2; + else + unpack = gen_avx512bw_sign_extendv32qiv32hi2; + halfmode = V32QImode; + extract + = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi; + break; case V32QImode: if (unsigned_p) unpack = gen_avx2_zero_extendv16qiv16hi2; @@ -39603,20 +39749,6 @@ x86_emit_floatuns (rtx operands[2]) emit_label (donelab); } \f -/* AVX512F does support 64-byte integer vector operations, - thus the longest vector we are faced with is V64QImode. 
*/ -#define MAX_VECT_LEN 64 - -struct expand_vec_perm_d -{ - rtx target, op0, op1; - unsigned char perm[MAX_VECT_LEN]; - enum machine_mode vmode; - unsigned char nelt; - bool one_operand_p; - bool testing_p; -}; - static bool canonicalize_perm (struct expand_vec_perm_d *d); static bool expand_vec_perm_1 (struct expand_vec_perm_d *d); static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d); @@ -42662,7 +42794,10 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) if (d->one_operand_p) return false; - if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) + if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 + && GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4) + ; + else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) ; else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) ; @@ -42693,12 +42828,18 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) switch (vmode) { + case V8DFmode: + case V16SFmode: case V4DFmode: case V8SFmode: case V2DFmode: case V4SFmode: case V8HImode: case V8SImode: + case V32HImode: + case V64QImode: + case V16SImode: + case V8DImode: for (i = 0; i < nelt; ++i) mask |= (d->perm[i] >= nelt) << i; break; @@ -42921,9 +43062,9 @@ static bool expand_vec_perm_pshufb (struct expand_vec_perm_d *d) { unsigned i, nelt, eltsz, mask; - unsigned char perm[32]; + unsigned char perm[64]; enum machine_mode vmode = V16QImode; - rtx rperm[32], vperm, target, op0, op1; + rtx rperm[64], vperm, target, op0, op1; nelt = d->nelt; @@ -43012,6 +43153,19 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) return false; } } + else if (GET_MODE_SIZE (d->vmode) == 64) + { + if (!TARGET_AVX512BW) + return false; + if (vmode == V64QImode) + { + /* vpshufb only works intra lanes, it is not + possible to shuffle bytes in between the lanes. 
*/ + for (i = 0; i < nelt; ++i) + if ((d->perm[i] ^ i) & (nelt / 4)) + return false; + } + } else return false; } @@ -43029,6 +43183,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) mask = 2 * nelt - 1; else if (vmode == V16QImode) mask = nelt - 1; + else if (vmode == V64QImode) + mask = nelt / 4 - 1; else mask = nelt / 2 - 1; @@ -43054,6 +43210,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); else if (vmode == V32QImode) emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); + else if (vmode == V64QImode) + emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm)); else if (vmode == V8SFmode) emit_insn (gen_avx2_permvarv8sf (target, op0, vperm)); else @@ -43109,12 +43267,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) rtx (*gen) (rtx, rtx) = NULL; switch (d->vmode) { + case V64QImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_vec_dupv64qi; + break; case V32QImode: gen = gen_avx2_pbroadcastv32qi_1; break; + case V32HImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_vec_dupv32hi; + break; case V16HImode: gen = gen_avx2_pbroadcastv16hi_1; break; + case V16SImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv16si; + break; case V8SImode: gen = gen_avx2_pbroadcastv8si_1; break; @@ -43124,9 +43294,21 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) case V8HImode: gen = gen_avx2_pbroadcastv8hi; break; + case V16SFmode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv16sf; + break; case V8SFmode: gen = gen_avx2_vec_dupv8sf_1; break; + case V8DFmode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv8df; + break; + case V8DImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv8di; + break; /* For other modes prefer other shuffles this function creates. */ default: break; } @@ -43210,16 +43392,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) return true; /* Try the AVX512F vpermi2 instructions. 
*/ - rtx vec[64]; - enum machine_mode mode = d->vmode; - if (mode == V8DFmode) - mode = V8DImode; - else if (mode == V16SFmode) - mode = V16SImode; - for (i = 0; i < nelt; ++i) - vec[i] = GEN_INT (d->perm[i]); - rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec)); - if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, mask, d->op1)) + if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d)) return true; return false; @@ -44932,21 +45105,56 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode, /* Given sufficient ISA support we can just return true here for selected vector modes. */ - if (d.vmode == V16SImode || d.vmode == V16SFmode - || d.vmode == V8DFmode || d.vmode == V8DImode) - /* All implementable with a single vpermi2 insn. */ - return true; - if (GET_MODE_SIZE (d.vmode) == 16) + switch (d.vmode) { + case V16SFmode: + case V16SImode: + case V8DImode: + case V8DFmode: + if (TARGET_AVX512F) + /* All implementable with a single vpermi2 insn. */ + return true; + break; + case V32HImode: + if (TARGET_AVX512BW) + /* All implementable with a single vpermi2 insn. */ + return true; + break; + case V8SImode: + case V8SFmode: + case V4DFmode: + case V4DImode: + if (TARGET_AVX512VL) + /* All implementable with a single vpermi2 insn. */ + return true; + break; + case V16HImode: + if (TARGET_AVX2) + /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ + return true; + break; + case V32QImode: + if (TARGET_AVX2) + /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ + return true; + break; + case V4SImode: + case V4SFmode: + case V8HImode: + case V16QImode: /* All implementable with a single vpperm insn. */ if (TARGET_XOP) return true; /* All implementable with 2 pshufb + 1 ior. */ if (TARGET_SSSE3) return true; + break; + case V2DImode: + case V2DFmode: /* All implementable with shufpd or unpck[lh]pd. 
*/ - if (d.nelt == 2) - return true; + return true; + default: + return false; } /* Extract the values from the vector CST into the permutation @@ -45066,6 +45274,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) gen_il = gen_avx2_interleave_lowv32qi; gen_ih = gen_avx2_interleave_highv32qi; break; + case V64QImode: + himode = V32HImode; + gen_il = gen_avx512bw_interleave_lowv64qi; + gen_ih = gen_avx512bw_interleave_highv64qi; + break; default: gcc_unreachable (); } @@ -45126,7 +45339,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) { /* For SSE2, we used an full interleave, so the desired results are in the even elements. */ - for (i = 0; i < 32; ++i) + for (i = 0; i < 64; ++i) d.perm[i] = i * 2; } else @@ -45134,7 +45347,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) /* For AVX, the interleave used above was not cross-lane. So the extraction is evens but with the second and third quarter swapped. Happily, that is even one insn shorter than even extraction. */ - for (i = 0; i < 32; ++i) + for (i = 0; i < 64; ++i) d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? 
-16 : 0); } diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index bb6372a..460cbff 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -300,6 +300,9 @@ (define_mode_iterator VI1_AVX2 [(V32QI "TARGET_AVX2") V16QI]) +(define_mode_iterator VI1_AVX512 + [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI]) + (define_mode_iterator VI2_AVX2 [(V16HI "TARGET_AVX2") V8HI]) @@ -9239,9 +9242,9 @@ (set_attr "mode" "TI")]) (define_expand "mul<mode>3" - [(set (match_operand:VI1_AVX2 0 "register_operand") - (mult:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand") - (match_operand:VI1_AVX2 2 "register_operand")))] + [(set (match_operand:VI1_AVX512 0 "register_operand") + (mult:VI1_AVX512 (match_operand:VI1_AVX512 1 "register_operand") + (match_operand:VI1_AVX512 2 "register_operand")))] "TARGET_SSE2" { ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]); @@ -10621,7 +10624,8 @@ (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2") (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F") - (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")]) + (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") + (V32HI "TARGET_AVX512BW")]) (define_expand "vec_perm<mode>" [(match_operand:VEC_PERM_AVX2 0 "register_operand") @@ -10642,7 +10646,8 @@ (V8SI "TARGET_AVX") (V4DI "TARGET_AVX") (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2") (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") - (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")]) + (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F") + (V32HI "TARGET_AVX512BW")]) (define_expand "vec_perm_const<mode>" [(match_operand:VEC_PERM_CONST 0 "register_operand") @@ -11006,8 +11011,8 @@ }) (define_insn "<sse2_avx2>_packsswb" - [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x") - (vec_concat:VI1_AVX2 + [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x") + (vec_concat:VI1_AVX512 (ss_truncate:<ssehalfvecmode> (match_operand:<sseunpackmode> 1 "register_operand" "0,x")) 
(ss_truncate:<ssehalfvecmode> @@ -11040,8 +11045,8 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn "<sse2_avx2>_packuswb" - [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x") - (vec_concat:VI1_AVX2 + [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x") + (vec_concat:VI1_AVX512 (us_truncate:<ssehalfvecmode> (match_operand:<sseunpackmode> 1 "register_operand" "0,x")) (us_truncate:<ssehalfvecmode> @@ -13559,21 +13564,21 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) -(define_insn "<ssse3_avx2>_pshufb<mode>3" - [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x") - (unspec:VI1_AVX2 - [(match_operand:VI1_AVX2 1 "register_operand" "0,x") - (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")] +(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>" + [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,v") + (unspec:VI1_AVX512 + [(match_operand:VI1_AVX512 1 "register_operand" "0,v") + (match_operand:VI1_AVX512 2 "nonimmediate_operand" "xm,vm")] UNSPEC_PSHUFB))] - "TARGET_SSSE3" + "TARGET_SSSE3 && <mask_mode512bit_condition> && <mask_avx512bw_condition>" "@ pshufb\t{%2, %0|%0, %2} - vpshufb\t{%2, %1, %0|%0, %1, %2}" + vpshufb\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sselog1") (set_attr "prefix_data16" "1,*") (set_attr "prefix_extra" "1") - (set_attr "prefix" "orig,vex") + (set_attr "prefix" "orig,maybe_evex") (set_attr "btver2_decode" "vector,vector") (set_attr "mode" "<sseinsnmode>")]) @@ -15948,9 +15953,9 @@ (set_attr "mode" "TI")]) (define_expand "<shift_insn><mode>3" - [(set (match_operand:VI1_AVX2 0 "register_operand") - (any_shift:VI1_AVX2 - (match_operand:VI1_AVX2 1 "register_operand") + [(set (match_operand:VI1_AVX512 0 "register_operand") + (any_shift:VI1_AVX512 + (match_operand:VI1_AVX512 1 "register_operand") (match_operand:SI 2 "nonmemory_operand")))] "TARGET_SSE2" { -- 1.8.3.1 ^ permalink raw reply 
[flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi). 2014-10-09 12:19 ` [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi) Ilya Tocar @ 2014-10-09 18:51 ` Jakub Jelinek 2014-10-10 15:49 ` Ilya Tocar 0 siblings, 1 reply; 11+ messages in thread From: Jakub Jelinek @ 2014-10-09 18:51 UTC (permalink / raw) To: Ilya Tocar; +Cc: Kirill Yukhin, Uros Bizjak, Richard Henderson, GCC Patches On Thu, Oct 09, 2014 at 04:15:23PM +0400, Ilya Tocar wrote: > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -21358,32 +21358,169 @@ ix86_expand_int_vcond (rtx operands[]) > return true; > } > > +/* AVX512F does support 64-byte integer vector operations, > + thus the longest vector we are faced with is V64QImode. */ > +#define MAX_VECT_LEN 64 > + > +struct expand_vec_perm_d > +{ > + rtx target, op0, op1; > + unsigned char perm[MAX_VECT_LEN]; > + enum machine_mode vmode; > + unsigned char nelt; > + bool one_operand_p; > + bool testing_p; > +}; > + > static bool > -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1) > +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, struct expand_vec_perm_d *d) Too long line, please wrap it. > { > - enum machine_mode mode = GET_MODE (op0); > + enum machine_mode mode = GET_MODE (d ? 
d->op0 : op0); > + > switch (mode) > { > + case V8HImode: > + if (!TARGET_AVX512VL || !TARGET_AVX512BW) > + return false; > + break; > + case V16HImode: > + if (!TARGET_AVX512VL || !TARGET_AVX512BW) > + return false; > + case V32HImode: > + if (!TARGET_AVX512BW) > + return false; > + break; > + case V4SImode: > + if (!TARGET_AVX512VL) > + return false; > + break; > + case V8SImode: > + if (!TARGET_AVX512VL) > + return false; > + break; > + case V16SImode: > + if (!TARGET_AVX512F) > + return false; > + break; > + case V4SFmode: > + if (!TARGET_AVX512VL) > + return false; > + break; > + case V8SFmode: > + if (!TARGET_AVX512VL) > + return false; > + break; > + case V16SFmode: > + if (!TARGET_AVX512F) > + return false; > + break; > + case V2DImode: > + if (!TARGET_AVX512VL) > + return false; > + break; > + case V4DImode: > + if (!TARGET_AVX512VL) > + return false; > + break; > + case V8DImode: > + if (!TARGET_AVX512F) > + return false; > + break; > + case V2DFmode: > + if (!TARGET_AVX512VL) > + return false; > + break; > + case V4DFmode: > + if (!TARGET_AVX512VL) > + return false; > + break; > + case V8DFmode: > + if (!TARGET_AVX512F) > + return false; > + break; > + default: > + return false; > + } > + > + /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const expander, > + so args are either in d, or in op0, op1 etc. */ > + if (d) > + { > + rtx vec[64]; > + target = d->target; > + op0 = d->op0; > + op1 = d->op1; > + for (int i = 0; i < d->nelt; ++i) > + vec[i] = GEN_INT (d->perm[i]); > + mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (d->nelt, vec)); Shouldn't the mask use integral vector mode rather than floating? My strong preference would be: enum machine_mode maskmode = mode; rtx (*gen) (rtx, rtx, rtx, rtx); right below the enum machine_mode mode = GET_MODE (d ? d->op0 : op0); line and then inside of the first switch just do: ... 
case V16SImode: if (!TARGET_AVX512F) return false; gen = gen_avx512f_vpermi2varv16si3; break; case V4SFmode: if (!TARGET_AVX512VL) return false; gen = gen_avx512vl_vpermi2varv4sf3; maskmode = V4SImode; break; ... etc., then in the mask = line use: mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); and finally instead of the second switch do: emit_insn (gen (target, op0, force_reg (maskmode, mask), op1)); return true; Otherwise, the patch LGTM, but will leave the final approval to Uros. Jakub ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi). 2014-10-09 18:51 ` Jakub Jelinek @ 2014-10-10 15:49 ` Ilya Tocar 2014-10-10 15:59 ` Jakub Jelinek 2014-10-10 16:39 ` Uros Bizjak 0 siblings, 2 replies; 11+ messages in thread From: Ilya Tocar @ 2014-10-10 15:49 UTC (permalink / raw) To: Jakub Jelinek; +Cc: Kirill Yukhin, Uros Bizjak, Richard Henderson, GCC Patches On 09 Oct 20:51, Jakub Jelinek wrote: > On Thu, Oct 09, 2014 at 04:15:23PM +0400, Ilya Tocar wrote: > > --- a/gcc/config/i386/i386.c > > +++ b/gcc/config/i386/i386.c > > @@ -21358,32 +21358,169 @@ ix86_expand_int_vcond (rtx operands[]) > > return true; > > } > > > > -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1) > > +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, struct expand_vec_perm_d *d) > > Too long line, please wrap it. > Fixed. > > { > > - enum machine_mode mode = GET_MODE (op0); > > + enum machine_mode mode = GET_MODE (d ? d->op0 : op0); > > + > > switch (mode) > > { > > + case V8HImode: > > + if (!TARGET_AVX512VL || !TARGET_AVX512BW) > > + return false; > > My strong preference would be: > enum machine_mode maskmode = mode; > rtx (*gen) (rtx, rtx, rtx, rtx); > right below the enum machine_mode mode = GET_MODE (d ? d->op0 : op0); > line and then inside of the first switch just do: > ... > case V16SImode: > if (!TARGET_AVX512F) > return false; > gen = gen_avx512f_vpermi2varv16si3; > break; > case V4SFmode: > if (!TARGET_AVX512VL) > return false; > gen = gen_avx512vl_vpermi2varv4sf3; > maskmode = V4SImode; > break; > ... > etc., then in the mask = line use: > mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); > and finally instead of the second switch do: > emit_insn (gen (target, op0, force_reg (maskmode, mask), op1)); > return true; > Updated patch below. 
--- gcc/config/i386/i386.c | 281 +++++++++++++++++++++++++++++++++++++++---------- gcc/config/i386/sse.md | 45 ++++---- 2 files changed, 253 insertions(+), 73 deletions(-) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 352ab81..2247da8 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -21358,33 +21358,132 @@ ix86_expand_int_vcond (rtx operands[]) return true; } +/* AVX512F does support 64-byte integer vector operations, + thus the longest vector we are faced with is V64QImode. */ +#define MAX_VECT_LEN 64 + +struct expand_vec_perm_d +{ + rtx target, op0, op1; + unsigned char perm[MAX_VECT_LEN]; + enum machine_mode vmode; + unsigned char nelt; + bool one_operand_p; + bool testing_p; +}; + static bool -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1) +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, + struct expand_vec_perm_d *d) { - enum machine_mode mode = GET_MODE (op0); + /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const + expander, so args are either in d, or in op0, op1 etc. */ + enum machine_mode mode = GET_MODE (d ? 
d->op0 : op0); + enum machine_mode maskmode = mode; + rtx (*gen) (rtx, rtx, rtx, rtx); + switch (mode) { + case V8HImode: + if (!TARGET_AVX512VL || !TARGET_AVX512BW) + return false; + gen = gen_avx512vl_vpermi2varv8hi3; + break; + case V16HImode: + if (!TARGET_AVX512VL || !TARGET_AVX512BW) + return false; + gen = gen_avx512vl_vpermi2varv16hi3; + break; + case V32HImode: + if (!TARGET_AVX512BW) + return false; + gen = gen_avx512bw_vpermi2varv32hi3; + break; + case V4SImode: + if (!TARGET_AVX512VL) + return false; + gen = gen_avx512vl_vpermi2varv4si3; + break; + case V8SImode: + if (!TARGET_AVX512VL) + return false; + gen = gen_avx512vl_vpermi2varv8si3; + break; case V16SImode: - emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0, - force_reg (V16SImode, mask), - op1)); - return true; + if (!TARGET_AVX512F) + return false; + gen = gen_avx512f_vpermi2varv16si3; + break; + case V4SFmode: + if (!TARGET_AVX512VL) + return false; + gen = gen_avx512vl_vpermi2varv4sf3; + maskmode = V4SImode; + break; + case V8SFmode: + if (!TARGET_AVX512VL) + return false; + gen = gen_avx512vl_vpermi2varv8sf3; + maskmode = V8SImode; + break; case V16SFmode: - emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0, - force_reg (V16SImode, mask), - op1)); - return true; + if (!TARGET_AVX512F) + return false; + gen = gen_avx512f_vpermi2varv16sf3; + maskmode = V16SImode; + break; + case V2DImode: + if (!TARGET_AVX512VL) + return false; + gen = gen_avx512vl_vpermi2varv2di3; + break; + case V4DImode: + if (!TARGET_AVX512VL) + return false; + gen = gen_avx512vl_vpermi2varv4di3; + break; case V8DImode: - emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0, - force_reg (V8DImode, mask), op1)); - return true; + if (!TARGET_AVX512F) + return false; + gen = gen_avx512f_vpermi2varv8di3; + break; + case V2DFmode: + if (!TARGET_AVX512VL) + return false; + gen = gen_avx512vl_vpermi2varv2df3; + maskmode = V2DImode; + break; + case V4DFmode: + if (!TARGET_AVX512VL) + return false; + gen = 
gen_avx512vl_vpermi2varv4df3; + maskmode = V4DImode; + break; case V8DFmode: - emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0, - force_reg (V8DImode, mask), op1)); - return true; + if (!TARGET_AVX512F) + return false; + gen = gen_avx512f_vpermi2varv8df3; + maskmode = V8DImode; + break; default: return false; } + + /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const + expander, so args are either in d, or in op0, op1 etc. */ + if (d) + { + rtx vec[64]; + target = d->target; + op0 = d->op0; + op1 = d->op1; + for (int i = 0; i < d->nelt; ++i) + vec[i] = GEN_INT (d->perm[i]); + mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); + } + + emit_insn (gen (target, op0, force_reg (maskmode, mask), op1)); + return true; } /* Expand a variable vector permutation. */ @@ -21407,7 +21506,7 @@ ix86_expand_vec_perm (rtx operands[]) e = GET_MODE_UNIT_SIZE (mode); gcc_assert (w <= 64); - if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1)) + if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL)) return; if (TARGET_AVX2) @@ -21779,6 +21878,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) switch (imode) { + case V64QImode: + if (unsigned_p) + unpack = gen_avx512bw_zero_extendv32qiv32hi2; + else + unpack = gen_avx512bw_sign_extendv32qiv32hi2; + halfmode = V32QImode; + extract + = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi; + break; case V32QImode: if (unsigned_p) unpack = gen_avx2_zero_extendv16qiv16hi2; @@ -39603,20 +39711,6 @@ x86_emit_floatuns (rtx operands[2]) emit_label (donelab); } \f -/* AVX512F does support 64-byte integer vector operations, - thus the longest vector we are faced with is V64QImode. 
*/ -#define MAX_VECT_LEN 64 - -struct expand_vec_perm_d -{ - rtx target, op0, op1; - unsigned char perm[MAX_VECT_LEN]; - enum machine_mode vmode; - unsigned char nelt; - bool one_operand_p; - bool testing_p; -}; - static bool canonicalize_perm (struct expand_vec_perm_d *d); static bool expand_vec_perm_1 (struct expand_vec_perm_d *d); static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d); @@ -42662,7 +42756,10 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) if (d->one_operand_p) return false; - if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) + if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 + && GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4) + ; + else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) ; else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) ; @@ -42693,12 +42790,18 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) switch (vmode) { + case V8DFmode: + case V16SFmode: case V4DFmode: case V8SFmode: case V2DFmode: case V4SFmode: case V8HImode: case V8SImode: + case V32HImode: + case V64QImode: + case V16SImode: + case V8DImode: for (i = 0; i < nelt; ++i) mask |= (d->perm[i] >= nelt) << i; break; @@ -42921,9 +43024,9 @@ static bool expand_vec_perm_pshufb (struct expand_vec_perm_d *d) { unsigned i, nelt, eltsz, mask; - unsigned char perm[32]; + unsigned char perm[64]; enum machine_mode vmode = V16QImode; - rtx rperm[32], vperm, target, op0, op1; + rtx rperm[64], vperm, target, op0, op1; nelt = d->nelt; @@ -43012,6 +43115,19 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) return false; } } + else if (GET_MODE_SIZE (d->vmode) == 64) + { + if (!TARGET_AVX512BW) + return false; + if (vmode == V64QImode) + { + /* vpshufb only works intra lanes, it is not + possible to shuffle bytes in between the lanes. 
*/ + for (i = 0; i < nelt; ++i) + if ((d->perm[i] ^ i) & (nelt / 4)) + return false; + } + } else return false; } @@ -43029,6 +43145,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) mask = 2 * nelt - 1; else if (vmode == V16QImode) mask = nelt - 1; + else if (vmode == V64QImode) + mask = nelt / 4 - 1; else mask = nelt / 2 - 1; @@ -43054,6 +43172,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); else if (vmode == V32QImode) emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); + else if (vmode == V64QImode) + emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm)); else if (vmode == V8SFmode) emit_insn (gen_avx2_permvarv8sf (target, op0, vperm)); else @@ -43109,12 +43229,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) rtx (*gen) (rtx, rtx) = NULL; switch (d->vmode) { + case V64QImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_vec_dupv64qi; + break; case V32QImode: gen = gen_avx2_pbroadcastv32qi_1; break; + case V32HImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_vec_dupv32hi; + break; case V16HImode: gen = gen_avx2_pbroadcastv16hi_1; break; + case V16SImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv16si; + break; case V8SImode: gen = gen_avx2_pbroadcastv8si_1; break; @@ -43124,9 +43256,21 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) case V8HImode: gen = gen_avx2_pbroadcastv8hi; break; + case V16SFmode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv16sf; + break; case V8SFmode: gen = gen_avx2_vec_dupv8sf_1; break; + case V8DFmode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv8df; + break; + case V8DImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv8di; + break; /* For other modes prefer other shuffles this function creates. */ default: break; } @@ -43210,16 +43354,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) return true; /* Try the AVX512F vpermi2 instructions. 
*/ - rtx vec[64]; - enum machine_mode mode = d->vmode; - if (mode == V8DFmode) - mode = V8DImode; - else if (mode == V16SFmode) - mode = V16SImode; - for (i = 0; i < nelt; ++i) - vec[i] = GEN_INT (d->perm[i]); - rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec)); - if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, mask, d->op1)) + if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d)) return true; return false; @@ -44932,21 +45067,56 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode, /* Given sufficient ISA support we can just return true here for selected vector modes. */ - if (d.vmode == V16SImode || d.vmode == V16SFmode - || d.vmode == V8DFmode || d.vmode == V8DImode) - /* All implementable with a single vpermi2 insn. */ - return true; - if (GET_MODE_SIZE (d.vmode) == 16) + switch (d.vmode) { + case V16SFmode: + case V16SImode: + case V8DImode: + case V8DFmode: + if (TARGET_AVX512F) + /* All implementable with a single vpermi2 insn. */ + return true; + break; + case V32HImode: + if (TARGET_AVX512BW) + /* All implementable with a single vpermi2 insn. */ + return true; + break; + case V8SImode: + case V8SFmode: + case V4DFmode: + case V4DImode: + if (TARGET_AVX512VL) + /* All implementable with a single vpermi2 insn. */ + return true; + break; + case V16HImode: + if (TARGET_AVX2) + /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ + return true; + break; + case V32QImode: + if (TARGET_AVX2) + /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ + return true; + break; + case V4SImode: + case V4SFmode: + case V8HImode: + case V16QImode: /* All implementable with a single vpperm insn. */ if (TARGET_XOP) return true; /* All implementable with 2 pshufb + 1 ior. */ if (TARGET_SSSE3) return true; + break; + case V2DImode: + case V2DFmode: /* All implementable with shufpd or unpck[lh]pd. 
*/ - if (d.nelt == 2) - return true; + return true; + default: + return false; } /* Extract the values from the vector CST into the permutation @@ -45066,6 +45236,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) gen_il = gen_avx2_interleave_lowv32qi; gen_ih = gen_avx2_interleave_highv32qi; break; + case V64QImode: + himode = V32HImode; + gen_il = gen_avx512bw_interleave_lowv64qi; + gen_ih = gen_avx512bw_interleave_highv64qi; + break; default: gcc_unreachable (); } @@ -45126,7 +45301,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) { /* For SSE2, we used an full interleave, so the desired results are in the even elements. */ - for (i = 0; i < 32; ++i) + for (i = 0; i < 64; ++i) d.perm[i] = i * 2; } else @@ -45134,7 +45309,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) /* For AVX, the interleave used above was not cross-lane. So the extraction is evens but with the second and third quarter swapped. Happily, that is even one insn shorter than even extraction. */ - for (i = 0; i < 32; ++i) + for (i = 0; i < 64; ++i) d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? 
-16 : 0); } diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index bb6372a..460cbff 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -300,6 +300,9 @@ (define_mode_iterator VI1_AVX2 [(V32QI "TARGET_AVX2") V16QI]) +(define_mode_iterator VI1_AVX512 + [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI]) + (define_mode_iterator VI2_AVX2 [(V16HI "TARGET_AVX2") V8HI]) @@ -9239,9 +9242,9 @@ (set_attr "mode" "TI")]) (define_expand "mul<mode>3" - [(set (match_operand:VI1_AVX2 0 "register_operand") - (mult:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand") - (match_operand:VI1_AVX2 2 "register_operand")))] + [(set (match_operand:VI1_AVX512 0 "register_operand") + (mult:VI1_AVX512 (match_operand:VI1_AVX512 1 "register_operand") + (match_operand:VI1_AVX512 2 "register_operand")))] "TARGET_SSE2" { ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]); @@ -10621,7 +10624,8 @@ (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2") (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F") - (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")]) + (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") + (V32HI "TARGET_AVX512BW")]) (define_expand "vec_perm<mode>" [(match_operand:VEC_PERM_AVX2 0 "register_operand") @@ -10642,7 +10646,8 @@ (V8SI "TARGET_AVX") (V4DI "TARGET_AVX") (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2") (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") - (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")]) + (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F") + (V32HI "TARGET_AVX512BW")]) (define_expand "vec_perm_const<mode>" [(match_operand:VEC_PERM_CONST 0 "register_operand") @@ -11006,8 +11011,8 @@ }) (define_insn "<sse2_avx2>_packsswb" - [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x") - (vec_concat:VI1_AVX2 + [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x") + (vec_concat:VI1_AVX512 (ss_truncate:<ssehalfvecmode> (match_operand:<sseunpackmode> 1 "register_operand" "0,x")) 
(ss_truncate:<ssehalfvecmode> @@ -11040,8 +11045,8 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn "<sse2_avx2>_packuswb" - [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x") - (vec_concat:VI1_AVX2 + [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x") + (vec_concat:VI1_AVX512 (us_truncate:<ssehalfvecmode> (match_operand:<sseunpackmode> 1 "register_operand" "0,x")) (us_truncate:<ssehalfvecmode> @@ -13559,21 +13564,21 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) -(define_insn "<ssse3_avx2>_pshufb<mode>3" - [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x") - (unspec:VI1_AVX2 - [(match_operand:VI1_AVX2 1 "register_operand" "0,x") - (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")] +(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>" + [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,v") + (unspec:VI1_AVX512 + [(match_operand:VI1_AVX512 1 "register_operand" "0,v") + (match_operand:VI1_AVX512 2 "nonimmediate_operand" "xm,vm")] UNSPEC_PSHUFB))] - "TARGET_SSSE3" + "TARGET_SSSE3 && <mask_mode512bit_condition> && <mask_avx512bw_condition>" "@ pshufb\t{%2, %0|%0, %2} - vpshufb\t{%2, %1, %0|%0, %1, %2}" + vpshufb\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sselog1") (set_attr "prefix_data16" "1,*") (set_attr "prefix_extra" "1") - (set_attr "prefix" "orig,vex") + (set_attr "prefix" "orig,maybe_evex") (set_attr "btver2_decode" "vector,vector") (set_attr "mode" "<sseinsnmode>")]) @@ -15948,9 +15953,9 @@ (set_attr "mode" "TI")]) (define_expand "<shift_insn><mode>3" - [(set (match_operand:VI1_AVX2 0 "register_operand") - (any_shift:VI1_AVX2 - (match_operand:VI1_AVX2 1 "register_operand") + [(set (match_operand:VI1_AVX512 0 "register_operand") + (any_shift:VI1_AVX512 + (match_operand:VI1_AVX512 1 "register_operand") (match_operand:SI 2 "nonmemory_operand")))] "TARGET_SSE2" { -- 1.8.3.1 ^ permalink raw reply 
[flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi). 2014-10-10 15:49 ` Ilya Tocar @ 2014-10-10 15:59 ` Jakub Jelinek 2014-10-10 16:39 ` Uros Bizjak 1 sibling, 0 replies; 11+ messages in thread From: Jakub Jelinek @ 2014-10-10 15:59 UTC (permalink / raw) To: Ilya Tocar; +Cc: Kirill Yukhin, Uros Bizjak, Richard Henderson, GCC Patches On Fri, Oct 10, 2014 at 07:47:19PM +0400, Ilya Tocar wrote: > Updated patch below. You haven't posted ChangeLog entry this time, so using the last one: * config/i386/i386.c (MAX_VECT_LEN): Move above ix86_expand_vec_perm_vpermi2. ... * config/i386/sse.md (define_mode_iterator VI1_AVX512): New. I'd think you should avoid the line break after filename in these cases, so * config/i386/i386.c (MAX_VECT_LEN): Move above ix86_expand_vec_perm_vpermi2. ... * config/i386/sse.md (define_mode_iterator VI1_AVX512): New. Other than that nit it looks good to me. Jakub ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi). 2014-10-10 15:49 ` Ilya Tocar 2014-10-10 15:59 ` Jakub Jelinek @ 2014-10-10 16:39 ` Uros Bizjak 2014-10-16 10:25 ` Ilya Tocar 1 sibling, 1 reply; 11+ messages in thread From: Uros Bizjak @ 2014-10-10 16:39 UTC (permalink / raw) To: Ilya Tocar; +Cc: Jakub Jelinek, Kirill Yukhin, Richard Henderson, GCC Patches On Fri, Oct 10, 2014 at 5:47 PM, Ilya Tocar <tocarip.intel@gmail.com> wrote: >> My strong preference would be: >> enum machine_mode maskmode = mode; >> rtx (*gen) (rtx, rtx, rtx, rtx); >> right below the enum machine_mode mode = GET_MODE (d ? d->op0 : op0); >> line and then inside of the first switch just do: >> ... >> case V16SImode: >> if (!TARGET_AVX512F) >> return false; >> gen = gen_avx512f_vpermi2varv16si3; >> break; >> case V4SFmode: >> if (!TARGET_AVX512VL) >> return false; >> gen = gen_avx512vl_vpermi2varv4sf3; >> maskmode = V4SImode; >> break; >> ... >> etc., then in the mask = line use: >> mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); >> and finally instead of the second switch do: >> emit_insn (gen (target, op0, force_reg (maskmode, mask), op1)); >> return true; >> > Updated patch below. Please recode that horrible first switch statement to: --cut here-- rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; switch (mode) { case V8HImode: if (TARGET_AVX512VL && TARGET_AVX152BW) gen = gen_avx512vl_vpermi2varv8hi3; break; ... case V2DFmode: if (TARGET_AVX512VL) { gen = gen_avx512vl_vpermi2varv2df3; maskmode = V2DImode; } break; default: break; } if (gen == NULL) return false; --cut here-- The patch is OK with the above improvement. (Please also note that the patch has a bunch of i386.md changes that will clash with followup patch series). Thanks, Uros. ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi). 2014-10-10 16:39 ` Uros Bizjak @ 2014-10-16 10:25 ` Ilya Tocar 2014-10-16 11:18 ` Jakub Jelinek 2014-10-20 15:20 ` Ilya Tocar 0 siblings, 2 replies; 11+ messages in thread From: Ilya Tocar @ 2014-10-16 10:25 UTC (permalink / raw) To: Uros Bizjak; +Cc: Jakub Jelinek, Kirill Yukhin, Richard Henderson, GCC Patches On 10 Oct 18:37, Uros Bizjak wrote: > On Fri, Oct 10, 2014 at 5:47 PM, Ilya Tocar <tocarip.intel@gmail.com> wrote: > > > Please recode that horrible first switch statement to: > > --cut here-- > rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; > > switch (mode) > { > case V8HImode: > if (TARGET_AVX512VL && TARGET_AVX152BW) > gen = gen_avx512vl_vpermi2varv8hi3; > break; > > ... > > case V2DFmode: > if (TARGET_AVX512VL) > { > gen = gen_avx512vl_vpermi2varv2df3; > maskmode = V2DImode; > > The patch is OK with the above improvement. > > Thanks, > Uros. > Will commit version below, if no objections in 24 hours. --- gcc/config/i386/i386.c | 292 ++++++++++++++++++++++++++++++++++++++----------- gcc/config/i386/sse.md | 45 ++++---- 2 files changed, 255 insertions(+), 82 deletions(-) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index aedac19..e1228e3 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -21411,35 +21411,132 @@ ix86_expand_int_vcond (rtx operands[]) return true; } +/* AVX512F does support 64-byte integer vector operations, + thus the longest vector we are faced with is V64QImode. 
*/ +#define MAX_VECT_LEN 64 + +struct expand_vec_perm_d +{ + rtx target, op0, op1; + unsigned char perm[MAX_VECT_LEN]; + enum machine_mode vmode; + unsigned char nelt; + bool one_operand_p; + bool testing_p; +}; + static bool -ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1) +ix86_expand_vec_perm_vpermi2 (rtx target, rtx op0, rtx mask, rtx op1, + struct expand_vec_perm_d *d) { - enum machine_mode mode = GET_MODE (op0); + /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const + expander, so args are either in d, or in op0, op1 etc. */ + enum machine_mode mode = GET_MODE (d ? d->op0 : op0); + enum machine_mode maskmode = mode; + rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; + switch (mode) { + case V8HImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_vpermi2varv8hi3; + break; + case V16HImode: + if (TARGET_AVX512VL && TARGET_AVX512BW) + gen = gen_avx512vl_vpermi2varv16hi3; + break; + case V32HImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_vpermi2varv32hi3; + break; + case V4SImode: + if (TARGET_AVX512VL) + gen = gen_avx512vl_vpermi2varv4si3; + break; + case V8SImode: + if (TARGET_AVX512VL) + gen = gen_avx512vl_vpermi2varv8si3; + break; case V16SImode: - emit_insn (gen_avx512f_vpermi2varv16si3 (target, op0, - force_reg (V16SImode, mask), - op1)); - return true; + if (TARGET_AVX512F) + gen = gen_avx512f_vpermi2varv16si3; + break; + case V4SFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermi2varv4sf3; + maskmode = V4SImode; + } + break; + case V8SFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermi2varv8sf3; + maskmode = V8SImode; + } + break; case V16SFmode: - emit_insn (gen_avx512f_vpermi2varv16sf3 (target, op0, - force_reg (V16SImode, mask), - op1)); - return true; + if (TARGET_AVX512F) + { + gen = gen_avx512f_vpermi2varv16sf3; + maskmode = V16SImode; + } + break; + case V2DImode: + if (TARGET_AVX512VL) + gen = gen_avx512vl_vpermi2varv2di3; + break; + case V4DImode: + if (TARGET_AVX512VL) + 
gen = gen_avx512vl_vpermi2varv4di3; + break; case V8DImode: - emit_insn (gen_avx512f_vpermi2varv8di3 (target, op0, - force_reg (V8DImode, mask), - op1)); - return true; + if (TARGET_AVX512F) + gen = gen_avx512f_vpermi2varv8di3; + break; + case V2DFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermi2varv2df3; + maskmode = V2DImode; + } + break; + case V4DFmode: + if (TARGET_AVX512VL) + { + gen = gen_avx512vl_vpermi2varv4df3; + maskmode = V4DImode; + } + break; case V8DFmode: - emit_insn (gen_avx512f_vpermi2varv8df3 (target, op0, - force_reg (V8DImode, mask), - op1)); - return true; + if (TARGET_AVX512F) + { + gen = gen_avx512f_vpermi2varv8df3; + maskmode = V8DImode; + } + break; default: - return false; + break; } + + if (gen == NULL) + return false; + + /* ix86_expand_vec_perm_vpermi2 is called from both const and non-const + expander, so args are either in d, or in op0, op1 etc. */ + if (d) + { + rtx vec[64]; + target = d->target; + op0 = d->op0; + op1 = d->op1; + for (int i = 0; i < d->nelt; ++i) + vec[i] = GEN_INT (d->perm[i]); + mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec)); + } + + emit_insn (gen (target, op0, force_reg (maskmode, mask), op1)); + return true; } /* Expand a variable vector permutation. */ @@ -21462,8 +21559,7 @@ ix86_expand_vec_perm (rtx operands[]) e = GET_MODE_UNIT_SIZE (mode); gcc_assert (w <= 64); - if (TARGET_AVX512F - && ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1)) + if (ix86_expand_vec_perm_vpermi2 (target, op0, mask, op1, NULL)) return; if (TARGET_AVX2) @@ -21835,6 +21931,15 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) switch (imode) { + case V64QImode: + if (unsigned_p) + unpack = gen_avx512bw_zero_extendv32qiv32hi2; + else + unpack = gen_avx512bw_sign_extendv32qiv32hi2; + halfmode = V32QImode; + extract + = high_p ? 
gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi; + break; case V32QImode: if (unsigned_p) unpack = gen_avx2_zero_extendv16qiv16hi2; @@ -39683,20 +39788,6 @@ x86_emit_floatuns (rtx operands[2]) emit_label (donelab); } \f -/* AVX512F does support 64-byte integer vector operations, - thus the longest vector we are faced with is V64QImode. */ -#define MAX_VECT_LEN 64 - -struct expand_vec_perm_d -{ - rtx target, op0, op1; - unsigned char perm[MAX_VECT_LEN]; - enum machine_mode vmode; - unsigned char nelt; - bool one_operand_p; - bool testing_p; -}; - static bool canonicalize_perm (struct expand_vec_perm_d *d); static bool expand_vec_perm_1 (struct expand_vec_perm_d *d); static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d); @@ -42745,7 +42836,10 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) if (d->one_operand_p) return false; - if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) + if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64 + && GET_MODE_SIZE (GET_MODE_INNER (vmode)) >= 4) + ; + else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) ; else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) ; @@ -42776,12 +42870,18 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) switch (vmode) { + case V8DFmode: + case V16SFmode: case V4DFmode: case V8SFmode: case V2DFmode: case V4SFmode: case V8HImode: case V8SImode: + case V32HImode: + case V64QImode: + case V16SImode: + case V8DImode: for (i = 0; i < nelt; ++i) mask |= (d->perm[i] >= nelt) << i; break; @@ -43004,9 +43104,9 @@ static bool expand_vec_perm_pshufb (struct expand_vec_perm_d *d) { unsigned i, nelt, eltsz, mask; - unsigned char perm[32]; + unsigned char perm[64]; enum machine_mode vmode = V16QImode; - rtx rperm[32], vperm, target, op0, op1; + rtx rperm[64], vperm, target, op0, op1; nelt = d->nelt; @@ -43095,6 +43195,19 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) return false; } } + else if (GET_MODE_SIZE (d->vmode) == 64) + { + if (!TARGET_AVX512BW) + return false; + 
if (vmode == V64QImode) + { + /* vpshufb only works intra lanes, it is not + possible to shuffle bytes in between the lanes. */ + for (i = 0; i < nelt; ++i) + if ((d->perm[i] ^ i) & (nelt / 4)) + return false; + } + } else return false; } @@ -43112,6 +43225,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) mask = 2 * nelt - 1; else if (vmode == V16QImode) mask = nelt - 1; + else if (vmode == V64QImode) + mask = nelt / 4 - 1; else mask = nelt / 2 - 1; @@ -43137,6 +43252,8 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); else if (vmode == V32QImode) emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); + else if (vmode == V64QImode) + emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm)); else if (vmode == V8SFmode) emit_insn (gen_avx2_permvarv8sf (target, op0, vperm)); else @@ -43192,12 +43309,24 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) rtx (*gen) (rtx, rtx) = NULL; switch (d->vmode) { + case V64QImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_vec_dupv64qi; + break; case V32QImode: gen = gen_avx2_pbroadcastv32qi_1; break; + case V32HImode: + if (TARGET_AVX512BW) + gen = gen_avx512bw_vec_dupv32hi; + break; case V16HImode: gen = gen_avx2_pbroadcastv16hi_1; break; + case V16SImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv16si; + break; case V8SImode: gen = gen_avx2_pbroadcastv8si_1; break; @@ -43207,9 +43336,21 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) case V8HImode: gen = gen_avx2_pbroadcastv8hi; break; + case V16SFmode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv16sf; + break; case V8SFmode: gen = gen_avx2_vec_dupv8sf_1; break; + case V8DFmode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv8df; + break; + case V8DImode: + if (TARGET_AVX512F) + gen = gen_avx512f_vec_dupv8di; + break; /* For other modes prefer other shuffles this function creates. 
*/ default: break; } @@ -43294,23 +43435,10 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) /* Try the AVX2 vpalignr instruction. */ if (expand_vec_perm_palignr (d, true)) - return true; /* Try the AVX512F vpermi2 instructions. */ - if (TARGET_AVX512F) - { - rtx vec[64]; - enum machine_mode mode = d->vmode; - if (mode == V8DFmode) - mode = V8DImode; - else if (mode == V16SFmode) - mode = V16SImode; - for (i = 0; i < nelt; ++i) - vec[i] = GEN_INT (d->perm[i]); - rtx mask = gen_rtx_CONST_VECTOR (mode, gen_rtvec_v (nelt, vec)); - if (ix86_expand_vec_perm_vpermi2 (d->target, d->op0, mask, d->op1)) - return true; - } + if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d)) + return true; return false; } @@ -45097,21 +45225,56 @@ ix86_vectorize_vec_perm_const_ok (enum machine_mode vmode, /* Given sufficient ISA support we can just return true here for selected vector modes. */ - if (d.vmode == V16SImode || d.vmode == V16SFmode - || d.vmode == V8DFmode || d.vmode == V8DImode) - /* All implementable with a single vpermi2 insn. */ - return true; - if (GET_MODE_SIZE (d.vmode) == 16) + switch (d.vmode) { + case V16SFmode: + case V16SImode: + case V8DImode: + case V8DFmode: + if (TARGET_AVX512F) + /* All implementable with a single vpermi2 insn. */ + return true; + break; + case V32HImode: + if (TARGET_AVX512BW) + /* All implementable with a single vpermi2 insn. */ + return true; + break; + case V8SImode: + case V8SFmode: + case V4DFmode: + case V4DImode: + if (TARGET_AVX512VL) + /* All implementable with a single vpermi2 insn. */ + return true; + break; + case V16HImode: + if (TARGET_AVX2) + /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ + return true; + break; + case V32QImode: + if (TARGET_AVX2) + /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */ + return true; + break; + case V4SImode: + case V4SFmode: + case V8HImode: + case V16QImode: /* All implementable with a single vpperm insn. 
*/ if (TARGET_XOP) return true; /* All implementable with 2 pshufb + 1 ior. */ if (TARGET_SSSE3) return true; + break; + case V2DImode: + case V2DFmode: /* All implementable with shufpd or unpck[lh]pd. */ - if (d.nelt == 2) - return true; + return true; + default: + return false; } /* Extract the values from the vector CST into the permutation @@ -45231,6 +45394,11 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) gen_il = gen_avx2_interleave_lowv32qi; gen_ih = gen_avx2_interleave_highv32qi; break; + case V64QImode: + himode = V32HImode; + gen_il = gen_avx512bw_interleave_lowv64qi; + gen_ih = gen_avx512bw_interleave_highv64qi; + break; default: gcc_unreachable (); } @@ -45291,7 +45459,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) { /* For SSE2, we used an full interleave, so the desired results are in the even elements. */ - for (i = 0; i < 32; ++i) + for (i = 0; i < 64; ++i) d.perm[i] = i * 2; } else @@ -45299,7 +45467,7 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) /* For AVX, the interleave used above was not cross-lane. So the extraction is evens but with the second and third quarter swapped. Happily, that is even one insn shorter than even extraction. */ - for (i = 0; i < 32; ++i) + for (i = 0; i < 64; ++i) d.perm[i] = i * 2 + ((i & 24) == 8 ? 16 : (i & 24) == 16 ? 
-16 : 0); } diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index a6cf363..d78194f 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -301,6 +301,9 @@ (define_mode_iterator VI1_AVX2 [(V32QI "TARGET_AVX2") V16QI]) +(define_mode_iterator VI1_AVX512 + [(V64QI "TARGET_AVX512BW") (V32QI "TARGET_AVX2") V16QI]) + (define_mode_iterator VI2_AVX2 [(V32HI "TARGET_AVX512BW") (V16HI "TARGET_AVX2") V8HI]) @@ -9237,9 +9240,9 @@ (set_attr "mode" "TI")]) (define_expand "mul<mode>3<mask_name>" - [(set (match_operand:VI1_AVX2 0 "register_operand") - (mult:VI1_AVX2 (match_operand:VI1_AVX2 1 "register_operand") - (match_operand:VI1_AVX2 2 "register_operand")))] + [(set (match_operand:VI1_AVX512 0 "register_operand") + (mult:VI1_AVX512 (match_operand:VI1_AVX512 1 "register_operand") + (match_operand:VI1_AVX512 2 "register_operand")))] "TARGET_SSE2 && <mask_mode512bit_condition> && <mask_avx512bw_condition>" { ix86_expand_vecop_qihi (MULT, operands[0], operands[1], operands[2]); @@ -10643,7 +10646,8 @@ (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2") (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2") (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F") - (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F")]) + (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") + (V32HI "TARGET_AVX512BW")]) (define_expand "vec_perm<mode>" [(match_operand:VEC_PERM_AVX2 0 "register_operand") @@ -10664,7 +10668,8 @@ (V8SI "TARGET_AVX") (V4DI "TARGET_AVX") (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2") (V16SI "TARGET_AVX512F") (V8DI "TARGET_AVX512F") - (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F")]) + (V16SF "TARGET_AVX512F") (V8DF "TARGET_AVX512F") + (V32HI "TARGET_AVX512BW")]) (define_expand "vec_perm_const<mode>" [(match_operand:VEC_PERM_CONST 0 "register_operand") @@ -11028,8 +11033,8 @@ }) (define_insn "<sse2_avx2>_packsswb<mask_name>" - [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,v") - (vec_concat:VI1_AVX2 + [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x") + 
(vec_concat:VI1_AVX512 (ss_truncate:<ssehalfvecmode> (match_operand:<sseunpackmode> 1 "register_operand" "0,v")) (ss_truncate:<ssehalfvecmode> @@ -11062,8 +11067,8 @@ (set_attr "mode" "<sseinsnmode>")]) (define_insn "<sse2_avx2>_packuswb<mask_name>" - [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,v") - (vec_concat:VI1_AVX2 + [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,x") + (vec_concat:VI1_AVX512 (us_truncate:<ssehalfvecmode> (match_operand:<sseunpackmode> 1 "register_operand" "0,v")) (us_truncate:<ssehalfvecmode> @@ -13641,21 +13646,21 @@ (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)")) (set_attr "mode" "DI")]) -(define_insn "<ssse3_avx2>_pshufb<mode>3" - [(set (match_operand:VI1_AVX2 0 "register_operand" "=x,x") - (unspec:VI1_AVX2 - [(match_operand:VI1_AVX2 1 "register_operand" "0,x") - (match_operand:VI1_AVX2 2 "nonimmediate_operand" "xm,xm")] +(define_insn "<ssse3_avx2>_pshufb<mode>3<mask_name>" + [(set (match_operand:VI1_AVX512 0 "register_operand" "=x,v") + (unspec:VI1_AVX512 + [(match_operand:VI1_AVX512 1 "register_operand" "0,v") + (match_operand:VI1_AVX512 2 "nonimmediate_operand" "xm,vm")] UNSPEC_PSHUFB))] - "TARGET_SSSE3" + "TARGET_SSSE3 && <mask_mode512bit_condition> && <mask_avx512bw_condition>" "@ pshufb\t{%2, %0|%0, %2} - vpshufb\t{%2, %1, %0|%0, %1, %2}" + vpshufb\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" [(set_attr "isa" "noavx,avx") (set_attr "type" "sselog1") (set_attr "prefix_data16" "1,*") (set_attr "prefix_extra" "1") - (set_attr "prefix" "orig,vex") + (set_attr "prefix" "orig,maybe_evex") (set_attr "btver2_decode" "vector,vector") (set_attr "mode" "<sseinsnmode>")]) @@ -16038,9 +16043,9 @@ (set_attr "mode" "TI")]) (define_expand "<shift_insn><mode>3" - [(set (match_operand:VI1_AVX2 0 "register_operand") - (any_shift:VI1_AVX2 - (match_operand:VI1_AVX2 1 "register_operand") + [(set (match_operand:VI1_AVX512 0 "register_operand") + (any_shift:VI1_AVX512 + (match_operand:VI1_AVX512 1 
"register_operand") (match_operand:SI 2 "nonmemory_operand")))] "TARGET_SSE2" { -- 1.8.3.1 ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi). 2014-10-16 10:25 ` Ilya Tocar @ 2014-10-16 11:18 ` Jakub Jelinek 2014-10-20 15:20 ` Ilya Tocar 1 sibling, 0 replies; 11+ messages in thread From: Jakub Jelinek @ 2014-10-16 11:18 UTC (permalink / raw) To: Ilya Tocar; +Cc: Uros Bizjak, Kirill Yukhin, Richard Henderson, GCC Patches On Thu, Oct 16, 2014 at 02:23:16PM +0400, Ilya Tocar wrote: > On 10 Oct 18:37, Uros Bizjak wrote: > > On Fri, Oct 10, 2014 at 5:47 PM, Ilya Tocar <tocarip.intel@gmail.com> wrote: > > > > > > Please recode that horrible first switch statement to: > > > > --cut here-- > > rtx (*gen) (rtx, rtx, rtx, rtx) = NULL; > > > > switch (mode) > > { > > case V8HImode: > > if (TARGET_AVX512VL && TARGET_AVX152BW) > > gen = gen_avx512vl_vpermi2varv8hi3; > > break; > > > > ... > > > > case V2DFmode: > > if (TARGET_AVX512VL) > > { > > gen = gen_avx512vl_vpermi2varv2df3; > > maskmode = V2DImode; > > > > The patch is OK with the above improvement. > > > > Thanks, > > Uros. > > > > Will commit version below, if no objections in 24 hours. No need to wait, it is ok now (with proper ChangeLog of course). Jakub ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi). 2014-10-16 10:25 ` Ilya Tocar 2014-10-16 11:18 ` Jakub Jelinek @ 2014-10-20 15:20 ` Ilya Tocar 2014-10-20 16:59 ` Uros Bizjak 1 sibling, 1 reply; 11+ messages in thread From: Ilya Tocar @ 2014-10-20 15:20 UTC (permalink / raw) To: Uros Bizjak; +Cc: Jakub Jelinek, Kirill Yukhin, Richard Henderson, GCC Patches > > > > The patch is OK with the above improvement. > > > > > > Will commit version below, if no objections in 24 hours. > > Sorry, I've missed palignr, which should also have v64qi version, and lost return in expand_vec_perm_palignr case (this caused avx512f-vec-unpack test failures). Patch below fixes it. Ok for trunk? 2014-10-20 Ilya Tocar <ilya.tocar@intel.com> * config/i386/i386.c (expand_vec_perm_1): Fix expand_vec_perm_palignr case. * config/i386/sse.md (<ssse3_avx2>_palignr<mode>_mask): Use VI1_AVX512. --- gcc/config/i386/i386.c | 1 + gcc/config/i386/sse.md | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 33b21f4..34273ca 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -43552,6 +43552,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) /* Try the AVX2 vpalignr instruction. */ if (expand_vec_perm_palignr (d, true)) + return true; /* Try the AVX512F vpermi2 instructions. 
*/ if (ix86_expand_vec_perm_vpermi2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d)) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 8157045..a3f336f 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -13716,14 +13716,14 @@ (set_attr "mode" "DI")]) (define_insn "<ssse3_avx2>_palignr<mode>_mask" - [(set (match_operand:VI1_AVX2 0 "register_operand" "=v") - (vec_merge:VI1_AVX2 - (unspec:VI1_AVX2 - [(match_operand:VI1_AVX2 1 "register_operand" "v") - (match_operand:VI1_AVX2 2 "nonimmediate_operand" "vm") + [(set (match_operand:VI1_AVX512 0 "register_operand" "=v") + (vec_merge:VI1_AVX512 + (unspec:VI1_AVX512 + [(match_operand:VI1_AVX512 1 "register_operand" "v") + (match_operand:VI1_AVX512 2 "nonimmediate_operand" "vm") (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n")] UNSPEC_PALIGNR) - (match_operand:VI1_AVX2 4 "vector_move_operand" "0C") + (match_operand:VI1_AVX512 4 "vector_move_operand" "0C") (match_operand:<avx512fmaskmode> 5 "register_operand" "Yk")))] "TARGET_AVX512BW && (<MODE_SIZE> == 64 || TARGET_AVX512VL)" { -- 1.8.3.1 ^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi). 2014-10-20 15:20 ` Ilya Tocar @ 2014-10-20 16:59 ` Uros Bizjak 0 siblings, 0 replies; 11+ messages in thread From: Uros Bizjak @ 2014-10-20 16:59 UTC (permalink / raw) To: Ilya Tocar; +Cc: Jakub Jelinek, Kirill Yukhin, Richard Henderson, GCC Patches On Mon, Oct 20, 2014 at 5:19 PM, Ilya Tocar <tocarip.intel@gmail.com> wrote: >> > >> > The patch is OK with the above improvement. >> > >> > >> >> Will commit version below, if no objections in 24 hours. >> >> > Sorry, > I've missed palignr, which should also have v64qi version, > and lost return in expand_vec_perm_palignr case > (this caused avx512f-vec-unpack test failures). > Patch below fixes it. Ok for trunk? > > 2014-10-20 Ilya Tocar <ilya.tocar@intel.com> > > * config/i386/i386.c (expand_vec_perm_1): Fix > expand_vec_perm_palignr case. > * config/i386/sse.md (<ssse3_avx2>_palignr<mode>_mask): Use > VI1_AVX512. OK. Thanks, Uros. ^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2014-10-20 16:08 UTC | newest] Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2014-10-06 12:55 [PATCH i386 AVX512] [63/n] Add vpshufb, perm autogen Kirill Yukhin 2014-10-06 14:10 ` Jakub Jelinek 2014-10-09 12:19 ` [PATCH i386 AVX512] [63.1/n] Add vpshufb, perm autogen (except for v64qi) Ilya Tocar 2014-10-09 18:51 ` Jakub Jelinek 2014-10-10 15:49 ` Ilya Tocar 2014-10-10 15:59 ` Jakub Jelinek 2014-10-10 16:39 ` Uros Bizjak 2014-10-16 10:25 ` Ilya Tocar 2014-10-16 11:18 ` Jakub Jelinek 2014-10-20 15:20 ` Ilya Tocar 2014-10-20 16:59 ` Uros Bizjak
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).