From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path: 
Received: by sourceware.org (Postfix, from userid 1363)
	id DD7123836001; Thu, 10 Jun 2021 21:56:31 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org DD7123836001
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Content-Type: text/plain; charset="utf-8"
From: Uros Bizjak
To: gcc-cvs@gcc.gnu.org
Subject: [gcc r12-1372] i386: Add V8QI and other 64bit vector permutations [PR89021]
X-Act-Checkin: gcc
X-Git-Author: Uros Bizjak
X-Git-Refname: refs/heads/master
X-Git-Oldrev: ee52bf609bac45b3c251858a69071262f46ee89c
X-Git-Newrev: a325bdd195ee96f826b208c3afb9bed2ec077e12
Message-Id: <20210610215631.DD7123836001@sourceware.org>
Date: Thu, 10 Jun 2021 21:56:31 +0000 (GMT)
X-BeenThere: gcc-cvs@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-cvs mailing list
List-Unsubscribe: ,
List-Archive: 
List-Help: 
List-Subscribe: ,
X-List-Received-Date: Thu, 10 Jun 2021 21:56:32 -0000

https://gcc.gnu.org/g:a325bdd195ee96f826b208c3afb9bed2ec077e12

commit r12-1372-ga325bdd195ee96f826b208c3afb9bed2ec077e12
Author: Uros Bizjak
Date:   Thu Jun 10 13:54:12 2021 -0500

    i386: Add V8QI and other 64bit vector permutations [PR89021]

    In addition to V8QI permutations, several other missing permutations are
    added for 64bit vector modes for TARGET_SSSE3 and TARGET_SSE4_1 targets.

    2021-06-10  Uroš Bizjak

    gcc/
        PR target/89021
        * config/i386/i386-expand.c (ix86_split_mmx_punpck): Handle V2SF mode.
        Emit SHUFPS to fixup unpack-high for V2SF mode.
        (expand_vec_perm_blend): Handle 64bit modes for TARGET_SSE4_1.
        (expand_vec_perm_pshufb): Handle 64bit modes for TARGET_SSSE3.
        (expand_vec_perm_pblendv): Handle 64bit modes for TARGET_SSE4_1.
        (expand_vec_perm_interleave2): Handle 64bit modes.
        (expand_vec_perm_even_odd_pack): Handle V8QI mode.
        (expand_vec_perm_even_odd_1): Ditto.
        (ix86_vectorize_vec_perm_const): Ditto.
        * config/i386/i386.md (UNSPEC_PSHUFB): Move from ...
        * config/i386/sse.md: ... here.
        * config/i386/mmx.md (*vec_interleave_lowv2sf): New insn_and_split pattern.
        (*vec_interleave_highv2sf): Ditto.
        (mmx_pshufbv8qi3): New insn pattern.
        (*mmx_pblendw): Ditto.

Diff:
---
 gcc/config/i386/i386-expand.c | 191 ++++++++++++++++++++++++++++++++++++------
 gcc/config/i386/i386.md       |   1 +
 gcc/config/i386/mmx.md        |  86 +++++++++++++++++--
 gcc/config/i386/sse.md        |   1 -
 4 files changed, 246 insertions(+), 33 deletions(-)

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index c3ce21b4387..9ee5257adf9 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -798,6 +798,15 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
 					GEN_INT (1), GEN_INT (5)));
       break;
 
+    case E_V2SFmode:
+      sse_mode = V4SFmode;
+      double_sse_mode = V8SFmode;
+      mask = gen_rtx_PARALLEL (VOIDmode,
+			       gen_rtvec (4,
+					  GEN_INT (0), GEN_INT (4),
+					  GEN_INT (1), GEN_INT (5)));
+      break;
+
     default:
       gcc_unreachable ();
     }
@@ -812,14 +821,26 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
   rtx insn = gen_rtx_SET (dest, op2);
   emit_insn (insn);
 
+  /* Move bits 64:127 to bits 0:63.  */
   if (high_p)
     {
-      /* Move bits 64:127 to bits 0:63.  */
-      mask = gen_rtx_PARALLEL (VOIDmode,
-			       gen_rtvec (4, GEN_INT (2), GEN_INT (3),
-					  GEN_INT (0), GEN_INT (0)));
-      dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
-      op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
+      if (sse_mode == V4SFmode)
+	{
+	  mask = gen_rtx_PARALLEL (VOIDmode,
+				   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
+					      GEN_INT (4), GEN_INT (5)));
+	  op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
+	  op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
+	}
+      else
+	{
+	  mask = gen_rtx_PARALLEL (VOIDmode,
+				   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
+					      GEN_INT (0), GEN_INT (1)));
+	  dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
+	  op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
+	}
+
       insn = gen_rtx_SET (dest, op1);
       emit_insn (insn);
     }
@@ -17062,7 +17083,8 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
     ;
   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
     ;
-  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
+			     || GET_MODE_SIZE (vmode) == 8))
     ;
   else
     return false;
@@ -17095,6 +17117,7 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
     case E_V8SFmode:
     case E_V2DFmode:
     case E_V4SFmode:
+    case E_V4HImode:
     case E_V8HImode:
     case E_V8SImode:
     case E_V32HImode:
@@ -17111,6 +17134,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
       vmode = V8HImode;
       goto do_subreg;
 
+    case E_V2SImode:
+      for (i = 0; i < 2; ++i)
+	mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
+      vmode = V4HImode;
+      goto do_subreg;
+
     case E_V4SImode:
       for (i = 0; i < 4; ++i)
 	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
@@ -17132,7 +17161,9 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
       vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
       vperm = force_reg (vmode, vperm);
 
-      if (GET_MODE_SIZE (vmode) == 16)
+      if (GET_MODE_SIZE (vmode) == 8)
+	emit_insn (gen_mmx_pblendvb64 (target, op0, op1, vperm));
+      else if (GET_MODE_SIZE (vmode) == 16)
 	emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
       else
 	emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
@@ -17152,6 +17183,16 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
       op1 = gen_lowpart (vmode, op1);
       break;
 
+    case E_V8QImode:
+      for (i = 0; i < 8; i += 2)
+	if (d->perm[i] + 1 != d->perm[i + 1])
+	  goto use_pblendvb;
+
+      for (i = 0; i < 4; ++i)
+	mask |= (d->perm[i * 2] >= 8) << i;
+      vmode = V4HImode;
+      goto do_subreg;
+
     case E_V32QImode:
       /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
       for (i = 0; i < 32; i += 2)
@@ -17384,7 +17425,13 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
     }
   else
     {
-      if (GET_MODE_SIZE (d->vmode) == 16)
+      if (GET_MODE_SIZE (d->vmode) == 8)
+	{
+	  if (!TARGET_SSSE3)
+	    return false;
+	  vmode = V8QImode;
+	}
+      else if (GET_MODE_SIZE (d->vmode) == 16)
 	{
 	  if (!TARGET_SSSE3)
 	    return false;
@@ -17506,12 +17553,12 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
   eltsz = GET_MODE_UNIT_SIZE (d->vmode);
   if (!d->one_operand_p)
     mask = 2 * nelt - 1;
-  else if (vmode == V16QImode)
-    mask = nelt - 1;
   else if (vmode == V64QImode)
     mask = nelt / 4 - 1;
-  else
+  else if (vmode == V32QImode)
     mask = nelt / 2 - 1;
+  else
+    mask = nelt - 1;
 
   for (i = 0; i < nelt; ++i)
     {
@@ -17521,9 +17568,18 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
 	}
     }
 
-  vperm = gen_rtx_CONST_VECTOR (vmode,
-				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
-  vperm = force_reg (vmode, vperm);
+  machine_mode vpmode = vmode;
+
+  if (vmode == V8QImode)
+    {
+      for (i = nelt; i < 16; ++i)
+	rperm[i] = constm1_rtx;
+      vpmode = V16QImode;
+    }
+
+  vperm = gen_rtx_CONST_VECTOR (vpmode,
+				gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
+  vperm = force_reg (vpmode, vperm);
 
   target = d->target;
   if (d->vmode != vmode)
@@ -17531,7 +17587,9 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
   op0 = gen_lowpart (vmode, d->op0);
   if (d->one_operand_p)
     {
-      if (vmode == V16QImode)
+      if (vmode == V8QImode)
+	emit_insn (gen_mmx_pshufbv8qi3 (target, op0, vperm));
+      else if (vmode == V16QImode)
 	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
       else if (vmode == V32QImode)
 	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
@@ -18041,7 +18099,8 @@ expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
     ;
   else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
     ;
-  else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+  else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 8
+			     || GET_MODE_SIZE (vmode) == 16))
     ;
   else
     return false;
@@ -18120,7 +18179,8 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
   rtx_insn *seq;
   bool ok, same_halves = false;
 
-  if (GET_MODE_SIZE (d->vmode) == 16)
+  if (GET_MODE_SIZE (d->vmode) == 8
+      || GET_MODE_SIZE (d->vmode) == 16)
     {
       if (d->one_operand_p)
 	return false;
@@ -18155,7 +18215,44 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
   memset (remap, 0xff, sizeof (remap));
   dremap = *d;
 
-  if (GET_MODE_SIZE (d->vmode) == 16)
+  if (GET_MODE_SIZE (d->vmode) == 8)
+    {
+      unsigned HOST_WIDE_INT h1, h2, h3, h4;
+
+      /* Split the two input vectors into 4 halves.  */
+      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
+      h2 = h1 << nelt2;
+      h3 = h2 << nelt2;
+      h4 = h3 << nelt2;
+
+      /* If the elements from the low halves use interleave low,
+	 and similarly for interleave high.  */
+      if ((contents & (h1 | h3)) == contents)
+	{
+	  /* punpckl* */
+	  for (i = 0; i < nelt2; ++i)
+	    {
+	      remap[i] = i * 2;
+	      remap[i + nelt] = i * 2 + 1;
+	      dremap.perm[i * 2] = i;
+	      dremap.perm[i * 2 + 1] = i + nelt;
+	    }
+	}
+      else if ((contents & (h2 | h4)) == contents)
+	{
+	  /* punpckh* */
+	  for (i = 0; i < nelt2; ++i)
+	    {
+	      remap[i + nelt2] = i * 2;
+	      remap[i + nelt + nelt2] = i * 2 + 1;
+	      dremap.perm[i * 2] = i + nelt2;
+	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+	    }
+	}
+      else
+	return false;
+    }
+  else if (GET_MODE_SIZE (d->vmode) == 16)
     {
       unsigned HOST_WIDE_INT h1, h2, h3, h4;
@@ -19328,9 +19425,9 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
 }
 
 /* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
-   and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
-   with two "and" and "pack" or two "shift" and "pack" insns.  We should
-   have already failed all two instruction sequences.  */
+   and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
+   operands with two "and" and "pack" or two "shift" and "pack" insns.
+   We should have already failed all two instruction sequences.  */
 
 static bool
 expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
@@ -19359,6 +19456,15 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
       gen_pack = gen_sse4_1_packusdw;
       gen_shift = gen_lshrv4si3;
       break;
+    case E_V8QImode:
+      /* No check as all instructions are SSE2.  */
+      c = 0xff;
+      s = 8;
+      half_mode = V4HImode;
+      gen_and = gen_andv4hi3;
+      gen_pack = gen_mmx_packuswb;
+      gen_shift = gen_lshrv4hi3;
+      break;
     case E_V16QImode:
       /* No check as all instructions are SSE2.  */
       c = 0xff;
@@ -19391,8 +19497,8 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
       end_perm = true;
       break;
     default:
-      /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
-	 general shuffles.  */
+      /* Only V8QI, V8HI, V16QI, V16HI and V32QI modes
+	 are more profitable than general shuffles.  */
       return false;
     }
@@ -19621,6 +19727,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
 	}
       break;
 
+    case E_V8QImode:
     case E_V16QImode:
       return expand_vec_perm_even_odd_pack (d);
@@ -19786,6 +19893,41 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
       /* These are always implementable using standard shuffle patterns.  */
       gcc_unreachable ();
 
+    case E_V8QImode:
+      /* These can be implemented via interleave.  We save one insn by
+	 stopping once we have promoted to V2SImode and then use pshufd.  */
+      if (d->testing_p)
+	return true;
+      do
+	{
+	  rtx dest;
+	  rtx (*gen) (rtx, rtx, rtx)
+	    = vmode == V8QImode ? gen_mmx_punpcklbw
+				: gen_mmx_punpcklwd;
+
+	  if (elt >= nelt2)
+	    {
+	      gen = vmode == V8QImode ? gen_mmx_punpckhbw
+				      : gen_mmx_punpckhwd;
+	      elt -= nelt2;
+	    }
+	  nelt2 /= 2;
+
+	  dest = gen_reg_rtx (vmode);
+	  emit_insn (gen (dest, op0, op0));
+	  vmode = get_mode_wider_vector (vmode);
+	  op0 = gen_lowpart (vmode, dest);
+	}
+      while (vmode != V2SImode);
+
+      memset (perm2, elt, 2);
+      dest = gen_reg_rtx (V2SImode);
+      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
+      gcc_assert (ok);
+      if (!d->testing_p)
+	emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+      return true;
+
     case E_V8HImode:
     case E_V16QImode:
       /* These can be implemented via interleave.  We save one insn by
@@ -20289,6 +20431,7 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
     case E_V2SFmode:
     case E_V2SImode:
     case E_V4HImode:
+    case E_V8QImode:
       if (!TARGET_MMX_WITH_SSE)
 	return false;
       break;
diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 5ff49ec2f1c..7743c61ec86 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -119,6 +119,7 @@
   UNSPEC_MASKMOV
   UNSPEC_MOVMSK
   UNSPEC_BLENDV
+  UNSPEC_PSHUFB
   UNSPEC_RCP
   UNSPEC_RSQRT
   UNSPEC_PSADBW
diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md
index 0a17a54fad5..f9e7d2786c6 100644
--- a/gcc/config/i386/mmx.md
+++ b/gcc/config/i386/mmx.md
@@ -1198,6 +1198,40 @@
    (set_attr "prefix" "maybe_vex,orig")
    (set_attr "mode" "V4SF")])
 
+(define_insn_and_split "*vec_interleave_lowv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=x,v")
+	(vec_select:V2SF
+	  (vec_concat:V4SF
+	    (match_operand:V2SF 1 "register_operand" "0,v")
+	    (match_operand:V2SF 2 "register_operand" "x,v"))
+	  (parallel [(const_int 0) (const_int 2)])))]
+  "TARGET_MMX_WITH_SSE"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  "ix86_split_mmx_punpck (operands, false); DONE;"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix" "orig,maybe_evex")
+   (set_attr "mode" "V4SF")])
+
+(define_insn_and_split "*vec_interleave_highv2sf"
+  [(set (match_operand:V2SF 0 "register_operand" "=x,v")
+	(vec_select:V2SF
+	  (vec_concat:V4SF
+	    (match_operand:V2SF 1 "register_operand" "0,v")
+	    (match_operand:V2SF 2 "register_operand" "x,v"))
+	  (parallel [(const_int 1) (const_int 3)])))]
+  "TARGET_MMX_WITH_SSE"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  "ix86_split_mmx_punpck (operands, true); DONE;"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog")
+   (set_attr "prefix" "orig,vex")
+   (set_attr "mode" "V4SF")])
+
 (define_insn "*vec_dupv2sf"
   [(set (match_operand:V2SF 0 "register_operand" "=y,Yv,x")
 	(vec_duplicate:V2SF
@@ -2415,7 +2449,7 @@
    pack<s_trunsuffix>swb\t{%2, %0|%0, %2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
   "ix86_split_mmx_pack (operands, <any_s_truncate:CODE>); DONE;"
@@ -2435,7 +2469,7 @@
    packssdw\t{%2, %0|%0, %2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
  "ix86_split_mmx_pack (operands, SS_TRUNCATE); DONE;"
@@ -2458,7 +2492,7 @@
    punpckhbw\t{%2, %0|%0, %2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
   "ix86_split_mmx_punpck (operands, true); DONE;"
@@ -2481,7 +2515,7 @@
    punpcklbw\t{%2, %0|%0, %k2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
   "ix86_split_mmx_punpck (operands, false); DONE;"
@@ -2502,7 +2536,7 @@
    punpckhwd\t{%2, %0|%0, %2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
   "ix86_split_mmx_punpck (operands, true); DONE;"
@@ -2523,7 +2557,7 @@
    punpcklwd\t{%2, %0|%0, %k2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
   "ix86_split_mmx_punpck (operands, false); DONE;"
@@ -2544,7 +2578,7 @@
    punpckhdq\t{%2, %0|%0, %2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
   "ix86_split_mmx_punpck (operands, true); DONE;"
@@ -2565,7 +2599,7 @@
    punpckldq\t{%2, %0|%0, %k2}
    #
    #"
-  "TARGET_SSE2 && reload_completed
+  "&& reload_completed
    && SSE_REGNO_P (REGNO (operands[0]))"
   [(const_int 0)]
   "ix86_split_mmx_punpck (operands, false); DONE;"
@@ -2756,6 +2790,24 @@
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "TI")])
 
+(define_insn "mmx_pshufbv8qi3"
+  [(set (match_operand:V8QI 0 "register_operand" "=x,Yw")
+	(unspec:V8QI
+	  [(match_operand:V8QI 1 "register_operand" "0,Yw")
+	   (match_operand:V16QI 2 "vector_operand" "xBm,Ywm")]
+	  UNSPEC_PSHUFB))]
+  "TARGET_SSSE3 && TARGET_MMX_WITH_SSE"
+  "@
+   pshufb\t{%2, %0|%0, %2}
+   vpshufb\t{%2, %1, %0|%0, %1, %2}"
+  [(set_attr "isa" "noavx,avx")
+   (set_attr "type" "sselog1")
+   (set_attr "prefix_data16" "1,*")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "orig,maybe_evex")
+   (set_attr "btver2_decode" "vector")
+   (set_attr "mode" "TI")])
+
 (define_expand "mmx_pshufw"
   [(match_operand:V4HI 0 "register_operand")
    (match_operand:V4HI 1 "register_mmxmem_operand")
    (match_operand:SI 2 "const_int_operand")]
@@ -2828,6 +2880,24 @@
    (set_attr "length_immediate" "1")
    (set_attr "mode" "TI")])
 
+(define_insn "*mmx_pblendw"
+  [(set (match_operand:V4HI 0 "register_operand" "=Yr,*x,x")
+	(vec_merge:V4HI
+	  (match_operand:V4HI 2 "register_operand" "Yr,*x,x")
+	  (match_operand:V4HI 1 "register_operand" "0,0,x")
+	  (match_operand:SI 3 "const_0_to_63_operand" "n,n,n")))]
+  "TARGET_SSE4_1 && TARGET_MMX_WITH_SSE"
+  "@
+   pblendw\t{%3, %2, %0|%0, %2, %3}
+   pblendw\t{%3, %2, %0|%0, %2, %3}
+   vpblendw\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  [(set_attr "isa" "noavx,noavx,avx")
+   (set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "orig,orig,vex")
+   (set_attr "mode" "TI")])
+
 ;; Optimize V2SImode load from memory, swapping the elements and
 ;; storing back into the memory into DImode rotate of the memory by 32.
 (define_split
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 2a34756be2a..8403a07839f 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -28,7 +28,6 @@
   UNSPEC_LDDQU
 
   ;; SSSE3
-  UNSPEC_PSHUFB
   UNSPEC_PSIGN
   UNSPEC_PALIGNR
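
As an illustration (not part of the commit), the kind of 64-bit vector shuffles these expanders target can be written with GCC's generic vector extensions.  The type and function names below are invented for the example, and which sequence is actually chosen (pshufb, an and/packuswb pair, pblendw, or a punpck chain) depends on the permutation mask and on -mssse3/-msse4.1; the sketch assumes x86-64 at -O2, where TARGET_MMX_WITH_SSE is in effect:

  /* Illustrative only; names and the exact code generation are assumptions,
     not taken from the commit.  */
  typedef unsigned char v8qi __attribute__ ((vector_size (8)));
  typedef unsigned short v4hi __attribute__ ((vector_size (8)));

  v8qi
  reverse_bytes (v8qi x)
  {
    /* One-operand V8QI permutation; with -mssse3 a candidate for
       mmx_pshufbv8qi3.  */
    return __builtin_shuffle (x, (v8qi) { 7, 6, 5, 4, 3, 2, 1, 0 });
  }

  v8qi
  even_bytes (v8qi a, v8qi b)
  {
    /* Two-operand extract-even permutation; a candidate for
       expand_vec_perm_even_odd_pack ("and" + packuswb).  */
    return __builtin_shuffle (a, b, (v8qi) { 0, 2, 4, 6, 8, 10, 12, 14 });
  }

  v4hi
  blend_words (v4hi a, v4hi b)
  {
    /* Per-element select with a constant mask that keeps lanes in place;
       with -msse4.1 a candidate for *mmx_pblendw.  */
    return __builtin_shuffle (a, b, (v4hi) { 0, 5, 2, 7 });
  }

Before changes of this kind, such 64-bit shuffles would generally fall back to less efficient element-wise sequences on TARGET_MMX_WITH_SSE targets.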