From: chenglulu <chenglulu@loongson.cn>
To: Li Wei <liwei@loongson.cn>, gcc-patches@gcc.gnu.org
Cc: xry111@xry111.site, i@xen0n.name, xuchenghua@loongson.cn
Subject: Re:[pushed] [PATCH v2] LoongArch: Merge constant vector permutation implementations.
Date: Thu, 4 Jan 2024 14:18:57 +0800 [thread overview]
Message-ID: <da9824bc-a269-501e-ec48-cba0e01af3ad@loongson.cn> (raw)
In-Reply-To: <20231228122646.2594388-1-liwei@loongson.cn>
Pushed to r14-6908.
在 2023/12/28 下午8:26, Li Wei 写道:
> There are currently two implementations of constant vector
> permutation: loongarch_expand_vec_perm_const_1 and
> loongarch_expand_vec_perm_const_2. The two implementations
> differ, and currently only the implementation in
> loongarch_expand_vec_perm_const_1 is used for 256-bit vectors. We
> want to streamline the code as much as possible while retaining the
> better-performing implementation of the two. After repeated testing
> with SPEC2006 and SPEC2017, we arrived at the following merged version.
> Compared with the pre-merger version, the number of lines of code
> in loongarch.cc has been reduced by 888 lines. At the same time,
> the performance of SPECint2006 under Ofast has been improved by 0.97%,
> and the performance of SPEC2017 fprate has been improved by 0.27%.
>
> gcc/ChangeLog:
>
> * config/loongarch/loongarch.cc (loongarch_is_odd_extraction):
> Remove useless forward declaration.
> (loongarch_is_even_extraction): Remove useless forward declaration.
> (loongarch_try_expand_lsx_vshuf_const): Removed.
> (loongarch_expand_vec_perm_const_1): Merged.
> (loongarch_is_double_duplicate): Removed.
> (loongarch_is_center_extraction): Ditto.
> (loongarch_is_reversing_permutation): Ditto.
> (loongarch_is_di_misalign_extract): Ditto.
> (loongarch_is_si_misalign_extract): Ditto.
> (loongarch_is_lasx_lowpart_extract): Ditto.
> (loongarch_is_op_reverse_perm): Ditto.
> (loongarch_is_single_op_perm): Ditto.
> (loongarch_is_divisible_perm): Ditto.
> (loongarch_is_triple_stride_extract): Ditto.
> (loongarch_expand_vec_perm_const_2): Merged.
> (loongarch_expand_vec_perm_const): New.
> (loongarch_vectorize_vec_perm_const): Adjust.
> ---
> gcc/config/loongarch/loongarch.cc | 1308 +++++------------------------
> 1 file changed, 210 insertions(+), 1098 deletions(-)
>
> diff --git a/gcc/config/loongarch/loongarch.cc b/gcc/config/loongarch/loongarch.cc
> index 1d4d8f0b256..d5bf6a02a12 100644
> --- a/gcc/config/loongarch/loongarch.cc
> +++ b/gcc/config/loongarch/loongarch.cc
> @@ -8769,143 +8769,6 @@ loongarch_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
> }
> }
>
> -static bool
> -loongarch_is_odd_extraction (struct expand_vec_perm_d *);
> -
> -static bool
> -loongarch_is_even_extraction (struct expand_vec_perm_d *);
> -
> -static bool
> -loongarch_try_expand_lsx_vshuf_const (struct expand_vec_perm_d *d)
> -{
> - int i;
> - rtx target, op0, op1, sel, tmp;
> - rtx rperm[MAX_VECT_LEN];
> -
> - if (d->vmode == E_V2DImode || d->vmode == E_V2DFmode
> - || d->vmode == E_V4SImode || d->vmode == E_V4SFmode
> - || d->vmode == E_V8HImode || d->vmode == E_V16QImode)
> - {
> - target = d->target;
> - op0 = d->op0;
> - op1 = d->one_vector_p ? d->op0 : d->op1;
> -
> - if (GET_MODE (op0) != GET_MODE (op1)
> - || GET_MODE (op0) != GET_MODE (target))
> - return false;
> -
> - if (d->testing_p)
> - return true;
> -
> - /* If match extract-even and extract-odd permutations pattern, use
> - * vselect much better than vshuf. */
> - if (loongarch_is_odd_extraction (d)
> - || loongarch_is_even_extraction (d))
> - {
> - if (loongarch_expand_vselect_vconcat (d->target, d->op0, d->op1,
> - d->perm, d->nelt))
> - return true;
> -
> - unsigned char perm2[MAX_VECT_LEN];
> - for (i = 0; i < d->nelt; ++i)
> - perm2[i] = (d->perm[i] + d->nelt) & (2 * d->nelt - 1);
> -
> - if (loongarch_expand_vselect_vconcat (d->target, d->op1, d->op0,
> - perm2, d->nelt))
> - return true;
> - }
> -
> - for (i = 0; i < d->nelt; i += 1)
> - {
> - rperm[i] = GEN_INT (d->perm[i]);
> - }
> -
> - if (d->vmode == E_V2DFmode)
> - {
> - sel = gen_rtx_CONST_VECTOR (E_V2DImode, gen_rtvec_v (d->nelt, rperm));
> - tmp = simplify_gen_subreg (E_V2DImode, d->target, d->vmode, 0);
> - emit_move_insn (tmp, sel);
> - }
> - else if (d->vmode == E_V4SFmode)
> - {
> - sel = gen_rtx_CONST_VECTOR (E_V4SImode, gen_rtvec_v (d->nelt, rperm));
> - tmp = simplify_gen_subreg (E_V4SImode, d->target, d->vmode, 0);
> - emit_move_insn (tmp, sel);
> - }
> - else
> - {
> - sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, rperm));
> - emit_move_insn (d->target, sel);
> - }
> -
> - switch (d->vmode)
> - {
> - case E_V2DFmode:
> - emit_insn (gen_lsx_vshuf_d_f (target, target, op1, op0));
> - break;
> - case E_V2DImode:
> - emit_insn (gen_lsx_vshuf_d (target, target, op1, op0));
> - break;
> - case E_V4SFmode:
> - emit_insn (gen_lsx_vshuf_w_f (target, target, op1, op0));
> - break;
> - case E_V4SImode:
> - emit_insn (gen_lsx_vshuf_w (target, target, op1, op0));
> - break;
> - case E_V8HImode:
> - emit_insn (gen_lsx_vshuf_h (target, target, op1, op0));
> - break;
> - case E_V16QImode:
> - emit_insn (gen_lsx_vshuf_b (target, op1, op0, target));
> - break;
> - default:
> - break;
> - }
> -
> - return true;
> - }
> - return false;
> -}
> -
> -static bool
> -loongarch_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
> -{
> - unsigned int i, nelt = d->nelt;
> - unsigned char perm2[MAX_VECT_LEN];
> -
> - if (d->one_vector_p)
> - {
> - /* Try interleave with alternating operands. */
> - memcpy (perm2, d->perm, sizeof (perm2));
> - for (i = 1; i < nelt; i += 2)
> - perm2[i] += nelt;
> - if (loongarch_expand_vselect_vconcat (d->target, d->op0, d->op1, perm2,
> - nelt))
> - return true;
> - }
> - else
> - {
> - if (loongarch_expand_vselect_vconcat (d->target, d->op0, d->op1,
> - d->perm, nelt))
> - return true;
> -
> - /* Try again with swapped operands. */
> - for (i = 0; i < nelt; ++i)
> - perm2[i] = (d->perm[i] + nelt) & (2 * nelt - 1);
> - if (loongarch_expand_vselect_vconcat (d->target, d->op1, d->op0, perm2,
> - nelt))
> - return true;
> - }
> -
> - if (loongarch_expand_lsx_shuffle (d))
> - return true;
> - if (loongarch_expand_vec_perm_even_odd (d))
> - return true;
> - if (loongarch_expand_vec_perm_interleave (d))
> - return true;
> - return false;
> -}
> -
> /* Following are the assist function for const vector permutation support. */
> static bool
> loongarch_is_quad_duplicate (struct expand_vec_perm_d *d)
> @@ -8937,36 +8800,6 @@ loongarch_is_quad_duplicate (struct expand_vec_perm_d *d)
> return result;
> }
>
> -static bool
> -loongarch_is_double_duplicate (struct expand_vec_perm_d *d)
> -{
> - if (!d->one_vector_p)
> - return false;
> -
> - if (d->nelt < 8)
> - return false;
> -
> - bool result = true;
> - unsigned char buf = d->perm[0];
> -
> - for (int i = 1; i < d->nelt; i += 2)
> - {
> - if (d->perm[i] != buf)
> - {
> - result = false;
> - break;
> - }
> - if (d->perm[i - 1] != d->perm[i])
> - {
> - result = false;
> - break;
> - }
> - buf += d->nelt / 4;
> - }
> -
> - return result;
> -}
> -
> static bool
> loongarch_is_odd_extraction (struct expand_vec_perm_d *d)
> {
> @@ -9027,110 +8860,6 @@ loongarch_is_extraction_permutation (struct expand_vec_perm_d *d)
> return result;
> }
>
> -static bool
> -loongarch_is_center_extraction (struct expand_vec_perm_d *d)
> -{
> - bool result = true;
> - unsigned buf = d->nelt / 2;
> -
> - for (int i = 0; i < d->nelt; i += 1)
> - {
> - if (buf != d->perm[i])
> - {
> - result = false;
> - break;
> - }
> - buf += 1;
> - }
> -
> - return result;
> -}
> -
> -static bool
> -loongarch_is_reversing_permutation (struct expand_vec_perm_d *d)
> -{
> - if (!d->one_vector_p)
> - return false;
> -
> - bool result = true;
> - unsigned char buf = d->nelt - 1;
> -
> - for (int i = 0; i < d->nelt; i += 1)
> - {
> - if (d->perm[i] != buf)
> - {
> - result = false;
> - break;
> - }
> -
> - buf -= 1;
> - }
> -
> - return result;
> -}
> -
> -static bool
> -loongarch_is_di_misalign_extract (struct expand_vec_perm_d *d)
> -{
> - if (d->nelt != 4 && d->nelt != 8)
> - return false;
> -
> - bool result = true;
> - unsigned char buf;
> -
> - if (d->nelt == 4)
> - {
> - buf = 1;
> - for (int i = 0; i < d->nelt; i += 1)
> - {
> - if (buf != d->perm[i])
> - {
> - result = false;
> - break;
> - }
> -
> - buf += 1;
> - }
> - }
> - else if (d->nelt == 8)
> - {
> - buf = 2;
> - for (int i = 0; i < d->nelt; i += 1)
> - {
> - if (buf != d->perm[i])
> - {
> - result = false;
> - break;
> - }
> -
> - buf += 1;
> - }
> - }
> -
> - return result;
> -}
> -
> -static bool
> -loongarch_is_si_misalign_extract (struct expand_vec_perm_d *d)
> -{
> - if (d->vmode != E_V8SImode && d->vmode != E_V8SFmode)
> - return false;
> - bool result = true;
> - unsigned char buf = 1;
> -
> - for (int i = 0; i < d->nelt; i += 1)
> - {
> - if (buf != d->perm[i])
> - {
> - result = false;
> - break;
> - }
> - buf += 1;
> - }
> -
> - return result;
> -}
> -
> static bool
> loongarch_is_lasx_lowpart_interleave (struct expand_vec_perm_d *d)
> {
> @@ -9193,39 +8922,6 @@ loongarch_is_lasx_lowpart_interleave_2 (struct expand_vec_perm_d *d)
> return result;
> }
>
> -static bool
> -loongarch_is_lasx_lowpart_extract (struct expand_vec_perm_d *d)
> -{
> - bool result = true;
> - unsigned char buf = 0;
> -
> - for (int i = 0; i < d->nelt / 2; i += 1)
> - {
> - if (buf != d->perm[i])
> - {
> - result = false;
> - break;
> - }
> - buf += 1;
> - }
> -
> - if (result)
> - {
> - buf = d->nelt;
> - for (int i = d->nelt / 2; i < d->nelt; i += 1)
> - {
> - if (buf != d->perm[i])
> - {
> - result = false;
> - break;
> - }
> - buf += 1;
> - }
> - }
> -
> - return result;
> -}
> -
> static bool
> loongarch_is_lasx_highpart_interleave (expand_vec_perm_d *d)
> {
> @@ -9307,538 +9003,195 @@ loongarch_is_elem_duplicate (struct expand_vec_perm_d *d)
> return result;
> }
>
> -inline bool
> -loongarch_is_op_reverse_perm (struct expand_vec_perm_d *d)
> -{
> - return (d->vmode == E_V4DFmode)
> - && d->perm[0] == 2 && d->perm[1] == 3
> - && d->perm[2] == 0 && d->perm[3] == 1;
> -}
> +/* In LASX, some permutation insns do not have the behavior that GCC expects
> + when the compiler wants to emit a vector permutation.
> +
> + 1. What GCC provides via vectorize_vec_perm_const ()'s parameters:
> + When GCC wants to perform a vector permutation, it provides two op
> + registers, one target register, and a selector.
> + In const vector permutation case, GCC provides selector as a char array
> + that contains original value; in variable vector permutation
> + (performed via vec_perm<mode> insn template), it provides a vector register.
> + We assume that nelt is the number of elements inside a single vector in the
> + current 256bit vector mode.
> +
> + 2. What GCC expects to perform:
> + Two op registers (op0, op1) will "combine" into a 512bit temp vector storage
> + that has 2*nelt elements inside it; the low 256bit is op0, and high 256bit
> + is op1, then the elements are indexed as below:
> + 0 ~ nelt - 1 nelt ~ 2 * nelt - 1
> + |-------------------------|-------------------------|
> + Low 256bit (op0) High 256bit (op1)
> + For example, the second element in op1 (V8SImode) will be indexed with 9.
> + Selector is a vector that has the same mode and number of elements as
> + op0, op1 and target; it looks like this:
> + 0 ~ nelt - 1
> + |-------------------------|
> + 256bit (selector)
> + It describes which element from 512bit temp vector storage will fit into
> + target's every element slot.
> + GCC expects that every element in selector can be ANY index of the 512bit
> + vector storage (selector can pick literally any element from op0 and op1, and
> + then fit it into any place of target register). The LSX 128bit vshuf.*
> + instructions behave similarly, so we can handle 128bit vector permutation
> + with a single instruction easily.
> +
> + 3. What LASX permutation instruction does:
> + In short, it just executes two independent 128bit vector permutations, and
> + that is the reason we need to do the jobs below. We will explain it.
> + op0, op1, target, and selector are each separated into high 128bit and low
> + 128bit halves, and the permutation is done as described below:
> +
> + a) op0's low 128bit and op1's low 128bit "combines" into a 256bit temp
> + vector storage (TVS1), elements are indexed as below:
> + 0 ~ nelt / 2 - 1 nelt / 2 ~ nelt - 1
> + |---------------------|---------------------| TVS1
> + op0's low 128bit op1's low 128bit
> + op0's high 128bit and op1's high 128bit are "combined" into TVS2 in the
> + same way.
> + 0 ~ nelt / 2 - 1 nelt / 2 ~ nelt - 1
> + |---------------------|---------------------| TVS2
> + op0's high 128bit op1's high 128bit
> + b) Selector's low 128bit describes which elements from TVS1 will fit into
> + target vector's low 128bit. No TVS2 elements are allowed.
> + c) Selector's high 128bit describes which elements from TVS2 will fit into
> + target vector's high 128bit. No TVS1 elements are allowed.
> +
> + As we can see, if we want to handle vector permutation correctly, we can
> + achieve it in three ways:
> + a) Modify selector's elements, to make sure that every element selects the
> + correct value that will be put into target vector.
> + b) Generate extra instruction before/after permutation instruction, for
> + adjusting op vector or target vector, to make sure target vector's value is
> + what GCC expects.
> + c) Use other instructions to process op and put correct result into target.
> + */
> +
> +/* Implementation of constant vector permutation. This function identifies
> + recognized patterns of the permutation selector argument, and uses one or
> + more instruction(s) to finish the permutation job correctly. For unsupported
> + patterns, it will return false. */
>
> static bool
> -loongarch_is_single_op_perm (struct expand_vec_perm_d *d)
> +loongarch_expand_vec_perm_const (struct expand_vec_perm_d *d)
> {
> - bool result = true;
> + bool flag = false;
> + unsigned int i;
> + unsigned char idx;
> + rtx target, op0, op1, sel, tmp;
> + rtx rperm[MAX_VECT_LEN];
> + unsigned int remapped[MAX_VECT_LEN];
> + unsigned char perm2[MAX_VECT_LEN];
>
> - for (int i = 0; i < d->nelt; i += 1)
> + if (GET_MODE_SIZE (d->vmode) == 16)
> + return loongarch_expand_lsx_shuffle (d);
> + else
> {
> - if (d->perm[i] >= d->nelt)
> + if (d->one_vector_p)
> {
> - result = false;
> - break;
> + /* Try interleave with alternating operands. */
> + memcpy (perm2, d->perm, sizeof (perm2));
> + for (i = 1; i < d->nelt; i += 2)
> + perm2[i] += d->nelt;
> + if (loongarch_expand_vselect_vconcat (d->target, d->op0, d->op1,
> + perm2, d->nelt))
> + return true;
> }
> - }
> -
> - return result;
> -}
> -
> -static bool
> -loongarch_is_divisible_perm (struct expand_vec_perm_d *d)
> -{
> - bool result = true;
> -
> - for (int i = 0; i < d->nelt / 2; i += 1)
> - {
> - if (d->perm[i] >= d->nelt)
> + else
> {
> - result = false;
> - break;
> - }
> - }
> -
> - if (result)
> - {
> - for (int i = d->nelt / 2; i < d->nelt; i += 1)
> - {
> - if (d->perm[i] < d->nelt)
> - {
> - result = false;
> - break;
> - }
> - }
> - }
> -
> - return result;
> -}
> -
> -inline bool
> -loongarch_is_triple_stride_extract (struct expand_vec_perm_d *d)
> -{
> - return (d->vmode == E_V4DImode || d->vmode == E_V4DFmode)
> - && d->perm[0] == 1 && d->perm[1] == 4
> - && d->perm[2] == 7 && d->perm[3] == 0;
> -}
> -
> -/* In LASX, some permutation insn does not have the behavior that gcc expects
> - * when compiler wants to emit a vector permutation.
> - *
> - * 1. What GCC provides via vectorize_vec_perm_const ()'s paramater:
> - * When GCC wants to performs a vector permutation, it provides two op
> - * reigster, one target register, and a selector.
> - * In const vector permutation case, GCC provides selector as a char array
> - * that contains original value; in variable vector permuatation
> - * (performs via vec_perm<mode> insn template), it provides a vector register.
> - * We assume that nelt is the elements numbers inside single vector in current
> - * 256bit vector mode.
> - *
> - * 2. What GCC expects to perform:
> - * Two op registers (op0, op1) will "combine" into a 512bit temp vector storage
> - * that has 2*nelt elements inside it; the low 256bit is op0, and high 256bit
> - * is op1, then the elements are indexed as below:
> - * 0 ~ nelt - 1 nelt ~ 2 * nelt - 1
> - * |-------------------------|-------------------------|
> - * Low 256bit (op0) High 256bit (op1)
> - * For example, the second element in op1 (V8SImode) will be indexed with 9.
> - * Selector is a vector that has the same mode and number of elements with
> - * op0,op1 and target, it's look like this:
> - * 0 ~ nelt - 1
> - * |-------------------------|
> - * 256bit (selector)
> - * It describes which element from 512bit temp vector storage will fit into
> - * target's every element slot.
> - * GCC expects that every element in selector can be ANY indices of 512bit
> - * vector storage (Selector can pick literally any element from op0 and op1, and
> - * then fits into any place of target register). This is also what LSX 128bit
> - * vshuf.* instruction do similarly, so we can handle 128bit vector permutation
> - * by single instruction easily.
> - *
> - * 3. What LASX permutation instruction does:
> - * In short, it just execute two independent 128bit vector permuatation, and
> - * it's the reason that we need to do the jobs below. We will explain it.
> - * op0, op1, target, and selector will be separate into high 128bit and low
> - * 128bit, and do permutation as the description below:
> - *
> - * a) op0's low 128bit and op1's low 128bit "combines" into a 256bit temp
> - * vector storage (TVS1), elements are indexed as below:
> - * 0 ~ nelt / 2 - 1 nelt / 2 ~ nelt - 1
> - * |---------------------|---------------------| TVS1
> - * op0's low 128bit op1's low 128bit
> - * op0's high 128bit and op1's high 128bit are "combined" into TVS2 in the
> - * same way.
> - * 0 ~ nelt / 2 - 1 nelt / 2 ~ nelt - 1
> - * |---------------------|---------------------| TVS2
> - * op0's high 128bit op1's high 128bit
> - * b) Selector's low 128bit describes which elements from TVS1 will fit into
> - * target vector's low 128bit. No TVS2 elements are allowed.
> - * c) Selector's high 128bit describes which elements from TVS2 will fit into
> - * target vector's high 128bit. No TVS1 elements are allowed.
> - *
> - * As we can see, if we want to handle vector permutation correctly, we can
> - * achieve it in three ways:
> - * a) Modify selector's elements, to make sure that every elements can inform
> - * correct value that will put into target vector.
> - b) Generate extra instruction before/after permutation instruction, for
> - adjusting op vector or target vector, to make sure target vector's value is
> - what GCC expects.
> - c) Use other instructions to process op and put correct result into target.
> - */
> -
> -/* Implementation of constant vector permuatation. This function identifies
> - * recognized pattern of permuation selector argument, and use one or more
> - * instruction(s) to finish the permutation job correctly. For unsupported
> - * patterns, it will return false. */
> -
> -static bool
> -loongarch_expand_vec_perm_const_2 (struct expand_vec_perm_d *d)
> -{
> - /* Although we have the LSX vec_perm<mode> template, there's still some
> - 128bit vector permuatation operations send to vectorize_vec_perm_const.
> - In this case, we just simpliy wrap them by single vshuf.* instruction,
> - because LSX vshuf.* instruction just have the same behavior that GCC
> - expects. */
> - if (GET_MODE_SIZE (d->vmode) == 16)
> - return loongarch_try_expand_lsx_vshuf_const (d);
> - else
> - return false;
> -
> - bool ok = false, reverse_hi_lo = false, extract_ev_od = false,
> - use_alt_op = false;
> - unsigned char idx;
> - int i;
> - rtx target, op0, op1, sel, tmp;
> - rtx op0_alt = NULL_RTX, op1_alt = NULL_RTX;
> - rtx rperm[MAX_VECT_LEN];
> - unsigned int remapped[MAX_VECT_LEN];
> -
> - /* Try to figure out whether is a recognized permutation selector pattern, if
> - yes, we will reassign some elements with new value in selector argument,
> - and in some cases we will generate some assist insn to complete the
> - permutation. (Even in some cases, we use other insn to impl permutation
> - instead of xvshuf!)
> + if (loongarch_expand_vselect_vconcat (d->target, d->op0, d->op1,
> + d->perm, d->nelt))
> + return true;
>
> - Make sure to check d->testing_p is false everytime if you want to emit new
> - insn, unless you want to crash into ICE directly. */
> - if (loongarch_is_quad_duplicate (d))
> - {
> - /* Selector example: E_V8SImode, { 0, 0, 0, 0, 4, 4, 4, 4 }
> - copy first elem from original selector to all elem in new selector. */
> - idx = d->perm[0];
> - for (i = 0; i < d->nelt; i += 1)
> - {
> - remapped[i] = idx;
> - }
> - /* Selector after: { 0, 0, 0, 0, 0, 0, 0, 0 }. */
> - }
> - else if (loongarch_is_double_duplicate (d))
> - {
> - /* Selector example: E_V8SImode, { 1, 1, 3, 3, 5, 5, 7, 7 }
> - one_vector_p == true. */
> - for (i = 0; i < d->nelt / 2; i += 1)
> - {
> - idx = d->perm[i];
> - remapped[i] = idx;
> - remapped[i + d->nelt / 2] = idx;
> + /* Try again with swapped operands. */
> + for (i = 0; i < d->nelt; ++i)
> + perm2[i] = (d->perm[i] + d->nelt) & (2 * d->nelt - 1);
> + if (loongarch_expand_vselect_vconcat (d->target, d->op1, d->op0,
> + perm2, d->nelt))
> + return true;
> }
> - /* Selector after: { 1, 1, 3, 3, 1, 1, 3, 3 }. */
> - }
> - else if (loongarch_is_odd_extraction (d)
> - || loongarch_is_even_extraction (d))
> - {
> - /* Odd extraction selector sample: E_V4DImode, { 1, 3, 5, 7 }
> - Selector after: { 1, 3, 1, 3 }.
> - Even extraction selector sample: E_V4DImode, { 0, 2, 4, 6 }
> - Selector after: { 0, 2, 0, 2 }. */
>
> - /* Better implement of extract-even and extract-odd permutations. */
> - if (loongarch_expand_vec_perm_even_odd (d))
> + if (loongarch_expand_lsx_shuffle (d))
> return true;
>
> - for (i = 0; i < d->nelt / 2; i += 1)
> - {
> - idx = d->perm[i];
> - remapped[i] = idx;
> - remapped[i + d->nelt / 2] = idx;
> - }
> - /* Additional insn is required for correct result. See codes below. */
> - extract_ev_od = true;
> - }
> - else if (loongarch_is_extraction_permutation (d))
> - {
> - /* Selector sample: E_V8SImode, { 0, 1, 2, 3, 4, 5, 6, 7 }. */
> - if (d->perm[0] == 0)
> + if (loongarch_is_odd_extraction (d)
> + || loongarch_is_even_extraction (d))
> {
> - for (i = 0; i < d->nelt / 2; i += 1)
> - {
> - remapped[i] = i;
> - remapped[i + d->nelt / 2] = i;
> - }
> + if (loongarch_expand_vec_perm_even_odd (d))
> + return true;
> }
> - else
> +
> + if (loongarch_is_lasx_lowpart_interleave (d)
> + || loongarch_is_lasx_lowpart_interleave_2 (d)
> + || loongarch_is_lasx_highpart_interleave (d)
> + || loongarch_is_lasx_highpart_interleave_2 (d))
> {
> - /* { 8, 9, 10, 11, 12, 13, 14, 15 }. */
> - for (i = 0; i < d->nelt / 2; i += 1)
> - {
> - idx = i + d->nelt / 2;
> - remapped[i] = idx;
> - remapped[i + d->nelt / 2] = idx;
> - }
> + if (loongarch_expand_vec_perm_interleave (d))
> + return true;
> }
> - /* Selector after: { 0, 1, 2, 3, 0, 1, 2, 3 }
> - { 8, 9, 10, 11, 8, 9, 10, 11 } */
> - }
> - else if (loongarch_is_center_extraction (d))
> - {
> - /* sample: E_V4DImode, { 2, 3, 4, 5 }
> - In this condition, we can just copy high 128bit of op0 and low 128bit
> - of op1 to the target register by using xvpermi.q insn. */
> - if (!d->testing_p)
> +
> + if (loongarch_is_quad_duplicate (d))
> {
> - emit_move_insn (d->target, d->op1);
> - switch (d->vmode)
> + if (d->testing_p)
> + return true;
> + /* Selector example: E_V8SImode, { 0, 0, 0, 0, 4, 4, 4, 4 }. */
> + for (i = 0; i < d->nelt; i += 1)
> {
> - case E_V4DImode:
> - emit_insn (gen_lasx_xvpermi_q_v4di (d->target, d->target,
> - d->op0, GEN_INT (0x21)));
> - break;
> - case E_V4DFmode:
> - emit_insn (gen_lasx_xvpermi_q_v4df (d->target, d->target,
> - d->op0, GEN_INT (0x21)));
> - break;
> - case E_V8SImode:
> - emit_insn (gen_lasx_xvpermi_q_v8si (d->target, d->target,
> - d->op0, GEN_INT (0x21)));
> - break;
> - case E_V8SFmode:
> - emit_insn (gen_lasx_xvpermi_q_v8sf (d->target, d->target,
> - d->op0, GEN_INT (0x21)));
> - break;
> - case E_V16HImode:
> - emit_insn (gen_lasx_xvpermi_q_v16hi (d->target, d->target,
> - d->op0, GEN_INT (0x21)));
> - break;
> - case E_V32QImode:
> - emit_insn (gen_lasx_xvpermi_q_v32qi (d->target, d->target,
> - d->op0, GEN_INT (0x21)));
> - break;
> - default:
> - break;
> + rperm[i] = GEN_INT (d->perm[0]);
> }
> + /* Selector after: { 0, 0, 0, 0, 0, 0, 0, 0 }. */
> + flag = true;
> + goto expand_perm_const_end;
> }
> - ok = true;
> - /* Finish the funtion directly. */
> - goto expand_perm_const_2_end;
> - }
> - else if (loongarch_is_reversing_permutation (d))
> - {
> - /* Selector sample: E_V8SImode, { 7, 6, 5, 4, 3, 2, 1, 0 }
> - one_vector_p == true */
> - idx = d->nelt / 2 - 1;
> - for (i = 0; i < d->nelt / 2; i += 1)
> - {
> - remapped[i] = idx;
> - remapped[i + d->nelt / 2] = idx;
> - idx -= 1;
> - }
> - /* Selector after: { 3, 2, 1, 0, 3, 2, 1, 0 }
> - Additional insn will be generated to swap hi and lo 128bit of target
> - register. */
> - reverse_hi_lo = true;
> - }
> - else if (loongarch_is_di_misalign_extract (d)
> - || loongarch_is_si_misalign_extract (d))
> - {
> - /* Selector Sample:
> - DI misalign: E_V4DImode, { 1, 2, 3, 4 }
> - SI misalign: E_V8SImode, { 1, 2, 3, 4, 5, 6, 7, 8 } */
> - if (!d->testing_p)
> - {
> - /* Copy original op0/op1 value to new temp register.
> - In some cases, operand register may be used in multiple place, so
> - we need new regiter instead modify original one, to avoid runtime
> - crashing or wrong value after execution. */
> - use_alt_op = true;
> - op1_alt = gen_reg_rtx (d->vmode);
> - emit_move_insn (op1_alt, d->op1);
> -
> - /* Adjust op1 for selecting correct value in high 128bit of target
> - register.
> - op1: E_V4DImode, { 4, 5, 6, 7 } -> { 2, 3, 4, 5 }. */
> - rtx conv_op1 = simplify_gen_subreg (E_V4DImode, op1_alt, d->vmode, 0);
> - rtx conv_op0 = simplify_gen_subreg (E_V4DImode, d->op0, d->vmode, 0);
> - emit_insn (gen_lasx_xvpermi_q_v4di (conv_op1, conv_op1,
> - conv_op0, GEN_INT (0x21)));
>
> - for (i = 0; i < d->nelt / 2; i += 1)
> - {
> - remapped[i] = d->perm[i];
> - remapped[i + d->nelt / 2] = d->perm[i];
> - }
> - /* Selector after:
> - DI misalign: { 1, 2, 1, 2 }
> - SI misalign: { 1, 2, 3, 4, 1, 2, 3, 4 } */
> - }
> - }
> - else if (loongarch_is_lasx_lowpart_interleave (d))
> - {
> - /* Elements from op0's low 18bit and op1's 128bit are inserted into
> - target register alternately.
> - sample: E_V4DImode, { 0, 4, 1, 5 } */
> - if (!d->testing_p)
> - {
> - /* Prepare temp register instead of modify original op. */
> - use_alt_op = true;
> - op1_alt = gen_reg_rtx (d->vmode);
> - op0_alt = gen_reg_rtx (d->vmode);
> - emit_move_insn (op1_alt, d->op1);
> - emit_move_insn (op0_alt, d->op0);
> -
> - /* Generate subreg for fitting into insn gen function. */
> - rtx conv_op1 = simplify_gen_subreg (E_V4DImode, op1_alt, d->vmode, 0);
> - rtx conv_op0 = simplify_gen_subreg (E_V4DImode, op0_alt, d->vmode, 0);
> -
> - /* Adjust op value in temp register.
> - op0 = {0,1,2,3}, op1 = {4,5,0,1} */
> - emit_insn (gen_lasx_xvpermi_q_v4di (conv_op1, conv_op1,
> - conv_op0, GEN_INT (0x02)));
> - /* op0 = {0,1,4,5}, op1 = {4,5,0,1} */
> - emit_insn (gen_lasx_xvpermi_q_v4di (conv_op0, conv_op0,
> - conv_op1, GEN_INT (0x01)));
> -
> - /* Remap indices in selector based on the location of index inside
> - selector, and vector element numbers in current vector mode. */
> -
> - /* Filling low 128bit of new selector. */
> - for (i = 0; i < d->nelt / 2; i += 1)
> - {
> - /* value in odd-indexed slot of low 128bit part of selector
> - vector. */
> - remapped[i] = i % 2 != 0 ? d->perm[i] - d->nelt / 2 : d->perm[i];
> - }
> - /* Then filling the high 128bit. */
> - for (i = d->nelt / 2; i < d->nelt; i += 1)
> + if (loongarch_is_extraction_permutation (d))
> + {
> + if (d->testing_p)
> + return true;
> + /* Selector sample: E_V8SImode, { 0, 1, 2, 3, 4, 5, 6, 7 }. */
> + if (d->perm[0] == 0)
> {
> - /* value in even-indexed slot of high 128bit part of
> - selector vector. */
> - remapped[i] = i % 2 == 0
> - ? d->perm[i] + (d->nelt / 2) * 3 : d->perm[i];
> + for (i = 0; i < d->nelt / 2; i += 1)
> + {
> + remapped[i] = i;
> + remapped[i + d->nelt / 2] = i;
> + }
> }
> - }
> - }
> - else if (loongarch_is_lasx_lowpart_interleave_2 (d))
> - {
> - /* Special lowpart interleave case in V32QI vector mode. It does the same
> - thing as we can see in if branch that above this line.
> - Selector sample: E_V32QImode,
> - {0, 1, 2, 3, 4, 5, 6, 7, 32, 33, 34, 35, 36, 37, 38, 39, 8,
> - 9, 10, 11, 12, 13, 14, 15, 40, 41, 42, 43, 44, 45, 46, 47} */
> - if (!d->testing_p)
> - {
> - /* Solution for this case in very simple - covert op into V4DI mode,
> - and do same thing as previous if branch. */
> - op1_alt = gen_reg_rtx (d->vmode);
> - op0_alt = gen_reg_rtx (d->vmode);
> - emit_move_insn (op1_alt, d->op1);
> - emit_move_insn (op0_alt, d->op0);
> -
> - rtx conv_op1 = simplify_gen_subreg (E_V4DImode, op1_alt, d->vmode, 0);
> - rtx conv_op0 = simplify_gen_subreg (E_V4DImode, op0_alt, d->vmode, 0);
> - rtx conv_target = simplify_gen_subreg (E_V4DImode, d->target,
> - d->vmode, 0);
> -
> - emit_insn (gen_lasx_xvpermi_q_v4di (conv_op1, conv_op1,
> - conv_op0, GEN_INT (0x02)));
> - emit_insn (gen_lasx_xvpermi_q_v4di (conv_op0, conv_op0,
> - conv_op1, GEN_INT (0x01)));
> - remapped[0] = 0;
> - remapped[1] = 4;
> - remapped[2] = 1;
> - remapped[3] = 5;
> -
> - for (i = 0; i < d->nelt; i += 1)
> + else
> {
> - rperm[i] = GEN_INT (remapped[i]);
> + /* { 8, 9, 10, 11, 12, 13, 14, 15 }. */
> + for (i = 0; i < d->nelt / 2; i += 1)
> + {
> + idx = i + d->nelt / 2;
> + remapped[i] = idx;
> + remapped[i + d->nelt / 2] = idx;
> + }
> }
> + /* Selector after: { 0, 1, 2, 3, 0, 1, 2, 3 }
> + { 8, 9, 10, 11, 8, 9, 10, 11 } */
>
> - sel = gen_rtx_CONST_VECTOR (E_V4DImode, gen_rtvec_v (4, rperm));
> - sel = force_reg (E_V4DImode, sel);
> - emit_insn (gen_lasx_xvshuf_d (conv_target, sel,
> - conv_op1, conv_op0));
> - }
> -
> - ok = true;
> - goto expand_perm_const_2_end;
> - }
> - else if (loongarch_is_lasx_lowpart_extract (d))
> - {
> - /* Copy op0's low 128bit to target's low 128bit, and copy op1's low
> - 128bit to target's high 128bit.
> - Selector sample: E_V4DImode, { 0, 1, 4 ,5 } */
> - if (!d->testing_p)
> - {
> - rtx conv_op1 = simplify_gen_subreg (E_V4DImode, d->op1, d->vmode, 0);
> - rtx conv_op0 = simplify_gen_subreg (E_V4DImode, d->op0, d->vmode, 0);
> - rtx conv_target = simplify_gen_subreg (E_V4DImode, d->target,
> - d->vmode, 0);
> -
> - /* We can achieve the expectation by using sinple xvpermi.q insn. */
> - emit_move_insn (conv_target, conv_op1);
> - emit_insn (gen_lasx_xvpermi_q_v4di (conv_target, conv_target,
> - conv_op0, GEN_INT (0x20)));
> - }
> -
> - ok = true;
> - goto expand_perm_const_2_end;
> - }
> - else if (loongarch_is_lasx_highpart_interleave (d))
> - {
> - /* Similar to lowpart interleave, elements from op0's high 128bit and
> - op1's high 128bit are inserted into target regiter alternately.
> - Selector sample: E_V8SImode, { 4, 12, 5, 13, 6, 14, 7, 15 } */
> - if (!d->testing_p)
> - {
> - /* Prepare temp op register. */
> - use_alt_op = true;
> - op1_alt = gen_reg_rtx (d->vmode);
> - op0_alt = gen_reg_rtx (d->vmode);
> - emit_move_insn (op1_alt, d->op1);
> - emit_move_insn (op0_alt, d->op0);
> -
> - rtx conv_op1 = simplify_gen_subreg (E_V4DImode, op1_alt, d->vmode, 0);
> - rtx conv_op0 = simplify_gen_subreg (E_V4DImode, op0_alt, d->vmode, 0);
> - /* Adjust op value in temp regiter.
> - op0 = { 0, 1, 2, 3 }, op1 = { 6, 7, 2, 3 } */
> - emit_insn (gen_lasx_xvpermi_q_v4di (conv_op1, conv_op1,
> - conv_op0, GEN_INT (0x13)));
> - /* op0 = { 2, 3, 6, 7 }, op1 = { 6, 7, 2, 3 } */
> - emit_insn (gen_lasx_xvpermi_q_v4di (conv_op0, conv_op0,
> - conv_op1, GEN_INT (0x01)));
> - /* Remap indices in selector based on the location of index inside
> - selector, and vector element numbers in current vector mode. */
> -
> - /* Filling low 128bit of new selector. */
> - for (i = 0; i < d->nelt / 2; i += 1)
> - {
> - /* value in even-indexed slot of low 128bit part of selector
> - vector. */
> - remapped[i] = i % 2 == 0 ? d->perm[i] - d->nelt / 2 : d->perm[i];
> - }
> - /* Then filling the high 128bit. */
> - for (i = d->nelt / 2; i < d->nelt; i += 1)
> - {
> - /* value in odd-indexed slot of high 128bit part of selector
> - vector. */
> - remapped[i] = i % 2 != 0
> - ? d->perm[i] - (d->nelt / 2) * 3 : d->perm[i];
> - }
> - }
> - }
> - else if (loongarch_is_lasx_highpart_interleave_2 (d))
> - {
> - /* Special highpart interleave case in V32QI vector mode. It does the
> - same thing as the normal version above.
> - Selector sample: E_V32QImode,
> - {16, 17, 18, 19, 20, 21, 22, 23, 48, 49, 50, 51, 52, 53, 54, 55,
> - 24, 25, 26, 27, 28, 29, 30, 31, 56, 57, 58, 59, 60, 61, 62, 63}
> - */
> - if (!d->testing_p)
> - {
> - /* Convert op into V4DImode and do the things. */
> - op1_alt = gen_reg_rtx (d->vmode);
> - op0_alt = gen_reg_rtx (d->vmode);
> - emit_move_insn (op1_alt, d->op1);
> - emit_move_insn (op0_alt, d->op0);
> -
> - rtx conv_op1 = simplify_gen_subreg (E_V4DImode, op1_alt, d->vmode, 0);
> - rtx conv_op0 = simplify_gen_subreg (E_V4DImode, op0_alt, d->vmode, 0);
> - rtx conv_target = simplify_gen_subreg (E_V4DImode, d->target,
> - d->vmode, 0);
> -
> - emit_insn (gen_lasx_xvpermi_q_v4di (conv_op1, conv_op1,
> - conv_op0, GEN_INT (0x13)));
> - emit_insn (gen_lasx_xvpermi_q_v4di (conv_op0, conv_op0,
> - conv_op1, GEN_INT (0x01)));
> - remapped[0] = 2;
> - remapped[1] = 6;
> - remapped[2] = 3;
> - remapped[3] = 7;
> -
> + /* Convert remapped selector array to RTL array. */
> for (i = 0; i < d->nelt; i += 1)
> {
> rperm[i] = GEN_INT (remapped[i]);
> }
>
> - sel = gen_rtx_CONST_VECTOR (E_V4DImode, gen_rtvec_v (4, rperm));
> - sel = force_reg (E_V4DImode, sel);
> - emit_insn (gen_lasx_xvshuf_d (conv_target, sel,
> - conv_op1, conv_op0));
> + flag = true;
> + goto expand_perm_const_end;
> }
>
> - ok = true;
> - goto expand_perm_const_2_end;
> - }
> - else if (loongarch_is_elem_duplicate (d))
> - {
> - /* Brocast single element (from op0 or op1) to all slot of target
> - register.
> - Selector sample:E_V8SImode, { 2, 2, 2, 2, 2, 2, 2, 2 } */
> - if (!d->testing_p)
> + if (loongarch_is_elem_duplicate (d))
> {
> + if (d->testing_p)
> + return true;
> + /* Broadcast a single element (from op0 or op1) to all slots of the
> + target register.
> + Selector sample: E_V8SImode, { 2, 2, 2, 2, 2, 2, 2, 2 } */
> rtx conv_op1 = simplify_gen_subreg (E_V4DImode, d->op1, d->vmode, 0);
> rtx conv_op0 = simplify_gen_subreg (E_V4DImode, d->op0, d->vmode, 0);
> rtx temp_reg = gen_reg_rtx (d->vmode);
> rtx conv_temp = simplify_gen_subreg (E_V4DImode, temp_reg,
> d->vmode, 0);
> -
> emit_move_insn (temp_reg, d->op0);
>
> idx = d->perm[0];
> @@ -9847,7 +9200,7 @@ loongarch_expand_vec_perm_const_2 (struct expand_vec_perm_d *d)
> value that we need to broardcast, because xvrepl128vei does the
> broardcast job from every 128bit of source register to
> corresponded part of target register! (A deep sigh.) */
> - if (/*idx >= 0 &&*/ idx < d->nelt / 2)
> + if (idx < d->nelt / 2)
> {
> emit_insn (gen_lasx_xvpermi_q_v4di (conv_temp, conv_temp,
> conv_op0, GEN_INT (0x0)));
> @@ -9902,310 +9255,75 @@ loongarch_expand_vec_perm_const_2 (struct expand_vec_perm_d *d)
> break;
> }
>
> - /* finish func directly. */
> - ok = true;
> - goto expand_perm_const_2_end;
> - }
> - }
> - else if (loongarch_is_op_reverse_perm (d))
> - {
> - /* reverse high 128bit and low 128bit in op0.
> - Selector sample: E_V4DFmode, { 2, 3, 0, 1 }
> - Use xvpermi.q for doing this job. */
> - if (!d->testing_p)
> - {
> - if (d->vmode == E_V4DImode)
> - {
> - emit_insn (gen_lasx_xvpermi_q_v4di (d->target, d->target, d->op0,
> - GEN_INT (0x01)));
> - }
> - else if (d->vmode == E_V4DFmode)
> - {
> - emit_insn (gen_lasx_xvpermi_q_v4df (d->target, d->target, d->op0,
> - GEN_INT (0x01)));
> - }
> - else
> - {
> - gcc_unreachable ();
> - }
> - }
> -
> - ok = true;
> - goto expand_perm_const_2_end;
> - }
> - else if (loongarch_is_single_op_perm (d))
> - {
> - /* Permutation that only select elements from op0. */
> - if (!d->testing_p)
> - {
> - /* Prepare temp register instead of modify original op. */
> - use_alt_op = true;
> - op0_alt = gen_reg_rtx (d->vmode);
> - op1_alt = gen_reg_rtx (d->vmode);
> -
> - emit_move_insn (op0_alt, d->op0);
> - emit_move_insn (op1_alt, d->op1);
> -
> - rtx conv_op0 = simplify_gen_subreg (E_V4DImode, d->op0, d->vmode, 0);
> - rtx conv_op0a = simplify_gen_subreg (E_V4DImode, op0_alt,
> - d->vmode, 0);
> - rtx conv_op1a = simplify_gen_subreg (E_V4DImode, op1_alt,
> - d->vmode, 0);
> -
> - /* Duplicate op0's low 128bit in op0, then duplicate high 128bit
> - in op1. After this, xvshuf.* insn's selector argument can
> - access all elements we need for correct permutation result. */
> - emit_insn (gen_lasx_xvpermi_q_v4di (conv_op0a, conv_op0a, conv_op0,
> - GEN_INT (0x00)));
> - emit_insn (gen_lasx_xvpermi_q_v4di (conv_op1a, conv_op1a, conv_op0,
> - GEN_INT (0x11)));
> -
> - /* In this case, there's no need to remap selector's indices. */
> - for (i = 0; i < d->nelt; i += 1)
> - {
> - remapped[i] = d->perm[i];
> - }
> + return true;
> }
> - }
> - else if (loongarch_is_divisible_perm (d))
> - {
> - /* Divisible perm:
> - Low 128bit of selector only selects elements of op0,
> - and high 128bit of selector only selects elements of op1. */
>
> - if (!d->testing_p)
> +expand_perm_const_end:
> + if (flag)
> {
> - /* Prepare temp register instead of modify original op. */
> - use_alt_op = true;
> - op0_alt = gen_reg_rtx (d->vmode);
> - op1_alt = gen_reg_rtx (d->vmode);
> -
> - emit_move_insn (op0_alt, d->op0);
> - emit_move_insn (op1_alt, d->op1);
> -
> - rtx conv_op0a = simplify_gen_subreg (E_V4DImode, op0_alt,
> - d->vmode, 0);
> - rtx conv_op1a = simplify_gen_subreg (E_V4DImode, op1_alt,
> - d->vmode, 0);
> - rtx conv_op0 = simplify_gen_subreg (E_V4DImode, d->op0, d->vmode, 0);
> - rtx conv_op1 = simplify_gen_subreg (E_V4DImode, d->op1, d->vmode, 0);
> -
> - /* Reorganize op0's hi/lo 128bit and op1's hi/lo 128bit, to make sure
> - that selector's low 128bit can access all op0's elements, and
> - selector's high 128bit can access all op1's elements. */
> - emit_insn (gen_lasx_xvpermi_q_v4di (conv_op0a, conv_op0a, conv_op1,
> - GEN_INT (0x02)));
> - emit_insn (gen_lasx_xvpermi_q_v4di (conv_op1a, conv_op1a, conv_op0,
> - GEN_INT (0x31)));
> -
> - /* No need to modify indices. */
> - for (i = 0; i < d->nelt;i += 1)
> + /* Copy selector vector from memory to vector register for later insn
> + gen function.
> + If the vector's elements are floating-point values, we cannot fit the
> + selector argument into insn gen function directly, because of the
> + insn template definition. As a solution, generate an integral mode
> + subreg of target, then copy selector vector (that is in integral
> + mode) to this subreg. */
> + switch (d->vmode)
> {
> - remapped[i] = d->perm[i];
> + case E_V4DFmode:
> + sel = gen_rtx_CONST_VECTOR (E_V4DImode, gen_rtvec_v (d->nelt,
> + rperm));
> + tmp = simplify_gen_subreg (E_V4DImode, d->target, d->vmode, 0);
> + emit_move_insn (tmp, sel);
> + break;
> + case E_V8SFmode:
> + sel = gen_rtx_CONST_VECTOR (E_V8SImode, gen_rtvec_v (d->nelt,
> + rperm));
> + tmp = simplify_gen_subreg (E_V8SImode, d->target, d->vmode, 0);
> + emit_move_insn (tmp, sel);
> + break;
> + default:
> + sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt,
> + rperm));
> + emit_move_insn (d->target, sel);
> + break;
> }
> - }
> - }
> - else if (loongarch_is_triple_stride_extract (d))
> - {
> - /* Selector sample: E_V4DFmode, { 1, 4, 7, 0 }. */
> - if (!d->testing_p)
> - {
> - /* Resolve it with brute force modification. */
> - remapped[0] = 1;
> - remapped[1] = 2;
> - remapped[2] = 3;
> - remapped[3] = 0;
> - }
> - }
> - else
> - {
> - /* When all of the detections above are failed, we will try last
> - strategy.
> - The for loop tries to detect following rules based on indices' value,
> - its position inside of selector vector ,and strange behavior of
> - xvshuf.* insn; Then we take corresponding action. (Replace with new
> - value, or give up whole permutation expansion.) */
> - for (i = 0; i < d->nelt; i += 1)
> - {
> - /* % (2 * d->nelt) */
> - idx = d->perm[i];
>
> - /* if index is located in low 128bit of selector vector. */
> - if (i < d->nelt / 2)
> - {
> - /* Fail case 1: index tries to reach element that located in op0's
> - high 128bit. */
> - if (idx >= d->nelt / 2 && idx < d->nelt)
> - {
> - goto expand_perm_const_2_end;
> - }
> - /* Fail case 2: index tries to reach element that located in
> - op1's high 128bit. */
> - if (idx >= (d->nelt + d->nelt / 2))
> - {
> - goto expand_perm_const_2_end;
> - }
> + target = d->target;
> + op0 = d->op0;
> + op1 = d->one_vector_p ? d->op0 : d->op1;
>
> - /* Success case: index tries to reach elements that located in
> - op1's low 128bit. Apply - (nelt / 2) offset to original
> - value. */
> - if (idx >= d->nelt && idx < (d->nelt + d->nelt / 2))
> - {
> - idx -= d->nelt / 2;
> - }
> - }
> - /* if index is located in high 128bit of selector vector. */
> - else
> + /* We FINALLY can generate xvshuf.* insn. */
> + switch (d->vmode)
> {
> - /* Fail case 1: index tries to reach element that located in
> - op1's low 128bit. */
> - if (idx >= d->nelt && idx < (d->nelt + d->nelt / 2))
> - {
> - goto expand_perm_const_2_end;
> - }
> - /* Fail case 2: index tries to reach element that located in
> - op0's low 128bit. */
> - if (idx < (d->nelt / 2))
> - {
> - goto expand_perm_const_2_end;
> - }
> - /* Success case: index tries to reach element that located in
> - op0's high 128bit. */
> - if (idx >= d->nelt / 2 && idx < d->nelt)
> - {
> - idx -= d->nelt / 2;
> - }
> + case E_V4DFmode:
> + emit_insn (gen_lasx_xvshuf_d_f (target, target, op1, op0));
> + break;
> + case E_V4DImode:
> + emit_insn (gen_lasx_xvshuf_d (target, target, op1, op0));
> + break;
> + case E_V8SFmode:
> + emit_insn (gen_lasx_xvshuf_w_f (target, target, op1, op0));
> + break;
> + case E_V8SImode:
> + emit_insn (gen_lasx_xvshuf_w (target, target, op1, op0));
> + break;
> + case E_V16HImode:
> + emit_insn (gen_lasx_xvshuf_h (target, target, op1, op0));
> + break;
> + case E_V32QImode:
> + emit_insn (gen_lasx_xvshuf_b (target, op1, op0, target));
> + break;
> + default:
> + gcc_unreachable ();
> + break;
> }
> - /* No need to process other case that we did not mentioned. */
> -
> - /* Assign with original or processed value. */
> - remapped[i] = idx;
> - }
> - }
> -
> - ok = true;
> - /* If testing_p is true, compiler is trying to figure out that backend can
> - handle this permutation, but doesn't want to generate actual insn. So
> - if true, exit directly. */
> - if (d->testing_p)
> - {
> - goto expand_perm_const_2_end;
> - }
> -
> - /* Convert remapped selector array to RTL array. */
> - for (i = 0; i < d->nelt; i += 1)
> - {
> - rperm[i] = GEN_INT (remapped[i]);
> - }
> -
> - /* Copy selector vector from memory to vector regiter for later insn gen
> - function.
> - If vector's element in floating point value, we cannot fit selector
> - argument into insn gen function directly, because of the insn template
> - definition. As a solution, generate a integral mode subreg of target,
> - then copy selector vector (that is in integral mode) to this subreg. */
> - switch (d->vmode)
> - {
> - case E_V4DFmode:
> - sel = gen_rtx_CONST_VECTOR (E_V4DImode, gen_rtvec_v (d->nelt, rperm));
> - tmp = simplify_gen_subreg (E_V4DImode, d->target, d->vmode, 0);
> - emit_move_insn (tmp, sel);
> - break;
> - case E_V8SFmode:
> - sel = gen_rtx_CONST_VECTOR (E_V8SImode, gen_rtvec_v (d->nelt, rperm));
> - tmp = simplify_gen_subreg (E_V8SImode, d->target, d->vmode, 0);
> - emit_move_insn (tmp, sel);
> - break;
> - default:
> - sel = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (d->nelt, rperm));
> - emit_move_insn (d->target, sel);
> - break;
> - }
> -
> - target = d->target;
> - /* If temp op registers are requested in previous if branch, then use temp
> - register intead of original one. */
> - if (use_alt_op)
> - {
> - op0 = op0_alt != NULL_RTX ? op0_alt : d->op0;
> - op1 = op1_alt != NULL_RTX ? op1_alt : d->op1;
> - }
> - else
> - {
> - op0 = d->op0;
> - op1 = d->one_vector_p ? d->op0 : d->op1;
> - }
> -
> - /* We FINALLY can generate xvshuf.* insn. */
> - switch (d->vmode)
> - {
> - case E_V4DFmode:
> - emit_insn (gen_lasx_xvshuf_d_f (target, target, op1, op0));
> - break;
> - case E_V4DImode:
> - emit_insn (gen_lasx_xvshuf_d (target, target, op1, op0));
> - break;
> - case E_V8SFmode:
> - emit_insn (gen_lasx_xvshuf_w_f (target, target, op1, op0));
> - break;
> - case E_V8SImode:
> - emit_insn (gen_lasx_xvshuf_w (target, target, op1, op0));
> - break;
> - case E_V16HImode:
> - emit_insn (gen_lasx_xvshuf_h (target, target, op1, op0));
> - break;
> - case E_V32QImode:
> - emit_insn (gen_lasx_xvshuf_b (target, op1, op0, target));
> - break;
> - default:
> - gcc_unreachable ();
> - break;
> - }
>
> - /* Extra insn for swapping the hi/lo 128bit of target vector register. */
> - if (reverse_hi_lo)
> - {
> - switch (d->vmode)
> - {
> - case E_V4DFmode:
> - emit_insn (gen_lasx_xvpermi_q_v4df (d->target, d->target,
> - d->target, GEN_INT (0x1)));
> - break;
> - case E_V4DImode:
> - emit_insn (gen_lasx_xvpermi_q_v4di (d->target, d->target,
> - d->target, GEN_INT (0x1)));
> - break;
> - case E_V8SFmode:
> - emit_insn (gen_lasx_xvpermi_q_v8sf (d->target, d->target,
> - d->target, GEN_INT (0x1)));
> - break;
> - case E_V8SImode:
> - emit_insn (gen_lasx_xvpermi_q_v8si (d->target, d->target,
> - d->target, GEN_INT (0x1)));
> - break;
> - case E_V16HImode:
> - emit_insn (gen_lasx_xvpermi_q_v16hi (d->target, d->target,
> - d->target, GEN_INT (0x1)));
> - break;
> - case E_V32QImode:
> - emit_insn (gen_lasx_xvpermi_q_v32qi (d->target, d->target,
> - d->target, GEN_INT (0x1)));
> - break;
> - default:
> - break;
> + return true;
> }
> }
> - /* Extra insn required by odd/even extraction. Swapping the second and third
> - 64bit in target vector register. */
> - else if (extract_ev_od)
> - {
> - rtx converted = simplify_gen_subreg (E_V4DImode, d->target, d->vmode, 0);
> - emit_insn (gen_lasx_xvpermi_d_v4di (converted, converted,
> - GEN_INT (0xD8)));
> - }
>
> -expand_perm_const_2_end:
> - return ok;
> + return false;
> }
>
> /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
> @@ -10289,25 +9407,19 @@ loongarch_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
> if (!d.one_vector_p)
> d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
>
> - ok = loongarch_expand_vec_perm_const_2 (&d);
> - if (ok)
> - return ok;
> -
> start_sequence ();
> - ok = loongarch_expand_vec_perm_const_1 (&d);
> + ok = loongarch_expand_vec_perm_const (&d);
> end_sequence ();
> return ok;
> }
>
> - ok = loongarch_expand_vec_perm_const_2 (&d);
> - if (!ok)
> - ok = loongarch_expand_vec_perm_const_1 (&d);
> + ok = loongarch_expand_vec_perm_const (&d);
>
> /* If we were given a two-vector permutation which just happened to
> have both input vectors equal, we folded this into a one-vector
> permutation. There are several loongson patterns that are matched
> via direct vec_select+vec_concat expansion, but we do not have
> - support in loongarch_expand_vec_perm_const_1 to guess the adjustment
> + support in loongarch_expand_vec_perm_const to guess the adjustment
> that should be made for a single operand. Just try again with
> the original permutation. */
> if (!ok && which == 3)
> @@ -10316,7 +9428,7 @@ loongarch_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
> d.op1 = op1;
> d.one_vector_p = false;
> memcpy (d.perm, orig_perm, MAX_VECT_LEN);
> - ok = loongarch_expand_vec_perm_const_1 (&d);
> + ok = loongarch_expand_vec_perm_const (&d);
> }
>
> return ok;
prev parent reply other threads:[~2024-01-04 6:19 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-12-28 12:26 Li Wei
2024-01-04 6:18 ` chenglulu [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=da9824bc-a269-501e-ec48-cba0e01af3ad@loongson.cn \
--to=chenglulu@loongson.cn \
--cc=gcc-patches@gcc.gnu.org \
--cc=i@xen0n.name \
--cc=liwei@loongson.cn \
--cc=xry111@xry111.site \
--cc=xuchenghua@loongson.cn \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).