On Tue, 25 Jul 2023 at 18:25, Richard Sandiford wrote: > > Hi, > > Thanks for the rework and sorry for the slow review. Hi Richard, Thanks for the suggestions! Please find my responses inline below. > > Prathamesh Kulkarni writes: > > Hi Richard, > > This is reworking of patch to extend fold_vec_perm to handle VLA vectors. > > The attached patch unifies handling of VLS and VLA vector_csts, while > > using fallback code > > for ctors. > > > > For VLS vector, the patch ignores underlying encoding, and > > uses npatterns = nelts, and nelts_per_pattern = 1. > > > > For VLA patterns, if sel has a stepped sequence, then it > > only chooses elements from a particular pattern of a particular > > input vector. > > > > To make things simpler, the patch imposes following constraints: > > (a) op0_npatterns, op1_npatterns and sel_npatterns are powers of 2. > > (b) The step size for a stepped sequence is a power of 2, and > > multiple of npatterns of chosen input vector. > > (c) Runtime vector length of sel is a multiple of sel_npatterns. > > So, we don't handle sel.length = 2 + 2x and npatterns = 4. > > > > Eg: > > op0, op1: npatterns = 2, nelts_per_pattern = 3 > > op0_len = op1_len = 16 + 16x. > > sel = { 0, 0, 2, 0, 4, 0, ... } > > npatterns = 2, nelts_per_pattern = 3. > > > > For pattern {0, 2, 4, ...} > > Let, > > a1 = 2 > > S = step size = 2 > > > > Let Esel denote number of elements per pattern in sel at runtime. > > Esel = (16 + 16x) / npatterns_sel > > = (16 + 16x) / 2 > > = (8 + 8x) > > > > So, last element of pattern: > > ae = a1 + (Esel - 2) * S > > = 2 + (8 + 8x - 2) * 2 > > = 14 + 16x > > > > a1 /trunc arg0_len = 2 / (16 + 16x) = 0 > > ae /trunc arg0_len = (14 + 16x) / (16 + 16x) = 0 > > Since both are equal with quotient = 0, we select elements from op0. > > > > Since step size (S) is a multiple of npatterns(op0), we select > > all elements from same pattern of op0. > > > > res_npatterns = max (op0_npatterns, max (op1_npatterns, sel_npatterns)) > > = max (2, max (2, 2) > > = 2 > > > > res_nelts_per_pattern = max (op0_nelts_per_pattern, > > max (op1_nelts_per_pattern, > > sel_nelts_per_pattern)) > > = max (3, max (3, 3)) > > = 3 > > > > So res has encoding with npatterns = 2, nelts_per_pattern = 3. > > res: { op0[0], op0[0], op0[2], op0[0], op0[4], op0[0], ... } > > > > Unfortunately, this results in an issue for poly_int_cst index: > > For example, > > op0, op1: npatterns = 1, nelts_per_pattern = 3 > > op0_len = op1_len = 4 + 4x > > > > sel: { 4 + 4x, 5 + 4x, 6 + 4x, ... } // should choose op1 > > > > In this case, > > a1 = 5 + 4x > > S = (6 + 4x) - (5 + 4x) = 1 > > Esel = 4 + 4x > > > > ae = a1 + (esel - 2) * S > > = (5 + 4x) + (4 + 4x - 2) * 1 > > = 7 + 8x > > > > IIUC, 7 + 8x will always be index for last element of op1 ? > > if x = 0, len = 4, 7 + 8x = 7 > > if x = 1, len = 8, 7 + 8x = 15, etc. > > So the stepped sequence will always choose elements > > from op1 regardless of vector length for above case ? > > > > However, > > ae /trunc op0_len > > = (7 + 8x) / (4 + 4x) > > which is not defined because 7/4 != 8/4 > > and we return NULL_TREE, but I suppose the expected result would be: > > res: { op1[0], op1[1], op1[2], ... } ? > > > > The patch passes bootstrap+test on aarch64-linux-gnu with and without sve, > > and on x86_64-unknown-linux-gnu. > > I would be grateful for suggestions on how to proceed. > > > > Thanks, > > Prathamesh > > > > diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc > > index a02ede79fed..8028b3e8e9a 100644 > > --- a/gcc/fold-const.cc > > +++ b/gcc/fold-const.cc > > @@ -85,6 +85,10 @@ along with GCC; see the file COPYING3. If not see > > #include "vec-perm-indices.h" > > #include "asan.h" > > #include "gimple-range.h" > > +#include > > +#include "tree-pretty-print.h" > > +#include "gimple-pretty-print.h" > > +#include "print-tree.h" > > > > /* Nonzero if we are folding constants inside an initializer or a C++ > > manifestly-constant-evaluated context; zero otherwise. > > @@ -10493,15 +10497,9 @@ fold_mult_zconjz (location_t loc, tree type, tree expr) > > static bool > > vec_cst_ctor_to_array (tree arg, unsigned int nelts, tree *elts) > > { > > - unsigned HOST_WIDE_INT i, nunits; > > + unsigned HOST_WIDE_INT i; > > > > - if (TREE_CODE (arg) == VECTOR_CST > > - && VECTOR_CST_NELTS (arg).is_constant (&nunits)) > > - { > > - for (i = 0; i < nunits; ++i) > > - elts[i] = VECTOR_CST_ELT (arg, i); > > - } > > - else if (TREE_CODE (arg) == CONSTRUCTOR) > > + if (TREE_CODE (arg) == CONSTRUCTOR) > > { > > constructor_elt *elt; > > > > @@ -10519,6 +10517,230 @@ vec_cst_ctor_to_array (tree arg, unsigned int nelts, tree *elts) > > return true; > > } > > > > +/* Return a vector with (NPATTERNS, NELTS_PER_PATTERN) encoding. */ > > + > > +static tree > > +vector_cst_reshape (tree vec, unsigned npatterns, unsigned nelts_per_pattern) > > +{ > > + gcc_assert (pow2p_hwi (npatterns)); > > + > > + if (VECTOR_CST_NPATTERNS (vec) == npatterns > > + && VECTOR_CST_NELTS_PER_PATTERN (vec) == nelts_per_pattern) > > + return vec; > > + > > + tree v = make_vector (exact_log2 (npatterns), nelts_per_pattern); > > + TREE_TYPE (v) = TREE_TYPE (vec); > > + > > + unsigned nelts = npatterns * nelts_per_pattern; > > + for (unsigned i = 0; i < nelts; i++) > > + VECTOR_CST_ENCODED_ELT(v, i) = vector_cst_elt (vec, i); > > + return v; > > +} > > + > > +/* Helper routine for fold_vec_perm_vla to check if ARG is a suitable > > + operand for VLA vec_perm folding. If arg is VLS, then set > > + NPATTERNS = nelts and NELTS_PER_PATTERN = 1. */ > > + > > +static tree > > +valid_operand_for_fold_vec_perm_cst_p (tree arg) > > +{ > > + if (TREE_CODE (arg) != VECTOR_CST) > > + return NULL_TREE; > > + > > + unsigned HOST_WIDE_INT nelts; > > + unsigned npatterns, nelts_per_pattern; > > + if (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg)).is_constant (&nelts)) > > + { > > + npatterns = nelts; > > + nelts_per_pattern = 1; > > + } > > + else > > + { > > + npatterns = VECTOR_CST_NPATTERNS (arg); > > + nelts_per_pattern = VECTOR_CST_NELTS_PER_PATTERN (arg); > > + } > > + > > + if (!pow2p_hwi (npatterns)) > > + return NULL_TREE; > > + > > + return vector_cst_reshape (arg, npatterns, nelts_per_pattern); > > +} > > I don't think we should reshape the vectors for VLS, since it would > create more nodes for GC to clean up later. Also, the "compact" encoding > is canonical even for VLS, so the reshaping would effectively create > noncanonical constants (even if only temporarily). Removed in the attached patch. > > Instead, I think we should change the later: > > > + if (!valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, sel_npatterns, > > + sel_nelts_per_pattern, reason, verbose)) > > + return NULL_TREE; > > so that it comes after the computation of res_npatterns and > res_nelts_per_pattern. Then, if valid_mask_for_fold_vec_perm_cst_p > returns false, and if the result type has a constant number of elements, > we should: > > * set res_npatterns to that number of elements > * set res_nelts_per_pattern to 1 > * continue instead of returning null Assuming we don't enforce only VLA or only VLS for input vectors and sel, won't that be still an issue if res (and sel) is VLS, and input vectors are VLA ? For eg: arg0, arg1 are type VNx4SI with npatterns = 2, nelts_per_pattern = 3, step = 2 sel is V4SI constant with encoding { 0, 2, 4, ... } and res_type is V4SI. In this case, when it comes to index 4, the vector selection becomes ambiguous, since it can be arg1 for len = 4 + 4x, and arg0 for lengths > 4 + 4x ? In the attached patch if sel is not a suitable mask, and input vectors have constant length, then it sets: res_npatterns = nelts of input vector res_nelts_per_pattern = 1 Does that look OK ? This part of the code has few tests in test_mixed(). > > The loop that follows will then do the correct thing for each element. > > The check for a power of 2 would then go in > valid_mask_for_fold_vec_perm_cst_p rather than > valid_operand_for_fold_vec_perm_cst_p. > > With that change, I think: > > > + > > +/* Helper routine for fold_vec_perm_cst to check if SEL is a suitable > > + mask for VLA vec_perm folding. Set SEL_NPATTERNS and SEL_NELTS_PER_PATTERN > > + similarly. */ > > + > > +static bool > > +valid_mask_for_fold_vec_perm_cst_p (tree arg0, tree arg1, > > + const vec_perm_indices &sel, > > + unsigned& sel_npatterns, > > + unsigned& sel_nelts_per_pattern, > > + char *reason = NULL, > > + bool verbose = false) > > +{ > > + unsigned HOST_WIDE_INT nelts; > > + if (sel.length ().is_constant (&nelts)) > > + { > > + sel_npatterns = nelts; > > + sel_nelts_per_pattern = 1; > > + } > > + else > > + { > > + sel_npatterns = sel.encoding ().npatterns (); > > + sel_nelts_per_pattern = sel.encoding ().nelts_per_pattern (); > > + } > > ...we should use the "else" code unconditionally. Done. > > The function comment should describe "reason" and "verbose". It looks > like these are debug parameters, is that right? Yes, verbose is just for printf debugging, and reason is used in unit-tests when result is NULL_TREE, to verify that it's NULL for the intended reason, and not due to something else. > > > + > > + if (!pow2p_hwi (sel_npatterns)) > > + { > > + if (reason) > > + strcpy (reason, "sel_npatterns is not power of 2"); > > + return false; > > + } > > + > > + /* We want to avoid cases where sel.length is not a multiple of npatterns. > > + For eg: sel.length = 2 + 2x, and sel npatterns = 4. */ > > + poly_uint64 esel; > > + if (!multiple_p (sel.length (), sel_npatterns, &esel)) > > + { > > + if (reason) > > + strcpy (reason, "sel.length is not multiple of sel_npatterns"); > > + return false; > > + } > > + > > + if (sel_nelts_per_pattern < 3) > > + return true; > > + > > + for (unsigned pattern = 0; pattern < sel_npatterns; pattern++) > > + { > > + poly_uint64 a1 = sel[pattern + sel_npatterns]; > > + poly_uint64 a2 = sel[pattern + 2 * sel_npatterns]; > > + > > + poly_uint64 step = a2 - a1; > > + if (!step.is_constant ()) > > + { > > + if (reason) > > + strcpy (reason, "step is not constant"); > > + return false; > > + } > > + int S = step.to_constant (); > > + if (S == 0) > > + continue; > > Might be simpler as: > > HOST_WIDE_INT step; > if (!(a2 - a1).is_constant (&step)) > { > ... > } This resulted in following error: ../../gcc/gcc/fold-const.cc:10563:34: error: no matching function for call to ‘poly_int<2, long unsigned int>::is_constant(long int*)’ 10563 | if (!(a2 - a1).is_constant (&S)) | ~~~~~~~~~~~~~~~~~~~~~~^~~~ > if (step == 0) > continue; > > > + > > + // FIXME: Punt on S < 0 for now, revisit later. > > + if (S < 0) > > + return false; > > + > > + if (!pow2p_hwi (S)) > > + { > > + if (reason) > > + strcpy (reason, "step is not power of 2"); > > + return false; > > + } > > + > > + poly_uint64 ae = a1 + (esel - 2) * S; > > + poly_uint64 arg_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > + uint64_t q1, qe; > > + poly_uint64 r1, re; > > + > > + /* Ensure that stepped sequence of the pattern selects elements > > + only from the same input vector. */ > > + if (!(can_div_trunc_p (a1, arg_len, &q1, &r1) > > + && can_div_trunc_p (ae, arg_len, &qe, &re) > > + && (q1 == qe))) > > Nit: redundant brackets around q1 == qe Fixed, thanks. > > > + { > > + if (reason) > > + strcpy (reason, "crossed input vectors"); > > + return false; > > + } > > + unsigned arg_npatterns > > + = ((q1 & 0) == 0) ? VECTOR_CST_NPATTERNS (arg0) > > + : VECTOR_CST_NPATTERNS (arg1); > > + > > + gcc_assert (pow2p_hwi (arg_npatterns)); > > + if (S < arg_npatterns) > > Since (as the reason string says) the condition is logically that the > step is a multiple of the number of patterns, rather than simply bigger, > I think it's more obvious to write it as: > > if (!multiple_p (step, arg_npatterns)) > > without the gcc_assert. Fixed, thanks. > > > + { > > + if (reason) > > + strcpy (reason, "S is not multiple of npatterns"); > > + return false; > > + } > > + } > > + > > + return true; > > +} > > + > > +/* Try to fold permutation of ARG0 and ARG1 with SEL selector when > > + the input vectors are VECTOR_CST. Return NULL_TREE otherwise. */ > > + > > +static tree > > +fold_vec_perm_cst (tree type, tree arg0, tree arg1, const vec_perm_indices &sel, > > + char *reason = NULL, bool verbose = false) > > +{ > > + /* Allow cases where: > > + (1) arg0, arg1 and sel are VLS. > > + (2) arg0, arg1, and sel are VLA. > > + Punt if input vectors are VLA but sel is VLS or vice-versa. */ > > + poly_uint64 arg_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > + if (!((arg_len.is_constant () && sel.length ().is_constant ()) > > + || (!arg_len.is_constant () && !sel.length ().is_constant ()))) > > + return NULL_TREE; > > With the changes above, I'm not sure we need to enforce this. Removed. > > > + unsigned sel_npatterns, sel_nelts_per_pattern; > > + > > + arg0 = valid_operand_for_fold_vec_perm_cst_p (arg0); > > + if (!arg0) > > + return NULL_TREE; > > + > > + arg1 = valid_operand_for_fold_vec_perm_cst_p (arg1); > > + if (!arg1) > > + return NULL_TREE; > > + > > + if (!valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, sel_npatterns, > > + sel_nelts_per_pattern, reason, verbose)) > > + return NULL_TREE; > > + > > + unsigned res_npatterns > > + = std::max (VECTOR_CST_NPATTERNS (arg0), > > + std::max (VECTOR_CST_NPATTERNS (arg1), sel_npatterns)); > > + > > + unsigned res_nelts_per_pattern > > + = std::max (VECTOR_CST_NELTS_PER_PATTERN (arg0), > > + std::max (VECTOR_CST_NELTS_PER_PATTERN (arg1), > > + sel_nelts_per_pattern)); > > + > > + > > Nit: too much vertical space. Fixed, thanks. > > > + tree_vector_builder out_elts (type, res_npatterns, res_nelts_per_pattern); > > + unsigned res_nelts = res_npatterns * res_nelts_per_pattern; > > + for (unsigned i = 0; i < res_nelts; i++) > > + { > > + poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > + uint64_t q; > > + poly_uint64 r; > > + unsigned HOST_WIDE_INT index; > > + > > + /* Punt if sel[i] /trunc_div len cannot be determined, > > + because the input vector to be chosen will depend on > > + runtime vector length. > > + For example if len == 4 + 4x, and sel[i] == 4, > > + If len at runtime equals 4, we choose arg1[0]. > > + For any other value of len > 4 at runtime, we choose arg0[4]. > > + which makes the element choice dependent on runtime vector length. */ > > + if (!can_div_trunc_p (sel[i], len, &q, &r)) > > + return NULL_TREE; > > + > > + /* sel[i] % len will give the index of element in the chosen input > > + vector. For example if sel[i] == 5 + 4x and len == 4 + 4x, > > + we will choose arg1[1] since (5 + 4x) % (4 + 4x) == 1. */ > > + if (!r.is_constant (&index)) > > + return NULL_TREE; > > + > > + tree arg = ((q & 1) == 0) ? arg0 : arg1; > > + tree elem = vector_cst_elt (arg, index); > > + out_elts.quick_push (elem); > > + } > > + > > + return out_elts.build (); > > +} > > + > > /* Attempt to fold vector permutation of ARG0 and ARG1 vectors using SEL > > selector. Return the folded VECTOR_CST or CONSTRUCTOR if successful, > > NULL_TREE otherwise. */ > > @@ -10528,43 +10750,39 @@ fold_vec_perm (tree type, tree arg0, tree arg1, const vec_perm_indices &sel) > > { > > unsigned int i; > > unsigned HOST_WIDE_INT nelts; > > - bool need_ctor = false; > > > > - if (!sel.length ().is_constant (&nelts)) > > - return NULL_TREE; > > - gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (type), nelts) > > - && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)), nelts) > > - && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)), nelts)); > > + gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (type), sel.length ()) > > + && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)), > > + TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)))); > > + > > if (TREE_TYPE (TREE_TYPE (arg0)) != TREE_TYPE (type) > > || TREE_TYPE (TREE_TYPE (arg1)) != TREE_TYPE (type)) > > return NULL_TREE; > > > > + if (TREE_CODE (arg0) == VECTOR_CST > > + && TREE_CODE (arg1) == VECTOR_CST) > > + return fold_vec_perm_cst (type, arg0, arg1, sel); > > + > > + /* For fall back case, we want to ensure arg and sel have same len. */ > > + if (!(sel.length ().is_constant (&nelts) > > + && known_eq (sel.length (), TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0))))) > > + return NULL_TREE; > > I think the known_eq should be an unconditional assert (after the call > to fold_vec_perm_cst but before the return NULL_TREE). Um sorry I am not sure if I understood this correctly. In the patch, I placed the assert unconditionally after checking sel is constant. Does that look OK ? > > Looks good otherwise. I'll do a separate review for the tests (but they > look pretty extensive, thanks). Thanks. The attached patch passes bootstrap+test on aarch64-linux-gnu with and without SVE enabled, and on x86_64-linux-gnu. Could you please elaborate on the following issue, because it still stands in this patch, which is in Case 2 of test_fold_vec_perm_cst::test_stepped. For example, op0, op1: npatterns = 1, nelts_per_pattern = 3 op0_len = op1_len = 4 + 4x sel: { 4 + 4x, 5 + 4x, 6 + 4x, ... } // should choose op1 In this case, a1 = 5 + 4x S = (6 + 4x) - (5 + 4x) = 1 Esel = 4 + 4x ae = a1 + (esel - 2) * S = (5 + 4x) + (4 + 4x - 2) * 1 = 7 + 8x IIUC, 7 + 8x will always be index for last element of op1 ? if x = 0, len = 4, 7 + 8x = 7 if x = 1, len = 8, 7 + 8x = 15, etc. So the stepped sequence will always choose elements from op1 regardless of vector length for above case ? However, ae /trunc op0_len = (7 + 8x) / (4 + 4x) which is not defined because 7/4 != 8/4 and we return NULL_TREE, but I suppose the expected result would be: res: { op1[0], op1[1], op1[2], ... } ? I was wondering if the following approach made sense: Since indices wrap around, take index % 2*arg_len, so the remainder is in interval (0, 2 * arg_len]. If both remainders are in interval [0, arg_len) -> choose arg0 If both remainders are in interval [arg_len, 2 * arg_len) -> choose arg1 If comparison is not determined at compile time or both lie in different intervals, return NULL_TREE. So for above example, r1 = (5 + 4x) % (8 + 8x) = 5 + 4x re = (7 + 8x) % (8 + 8x) = 7 + 8x Since both lie in interval [4+4x, 8+8x) --> choose arg1 If we have say a1 = 4, and arg_len = 4 + 4x, then the comparison becomes ambiguous, and known_lt (4, 4+4x) will return 0. In that case, we return NULL_TREE. Taking Case 4 eg from test_fold_vec_perm_cst::test_stepped(): op0, op1 -> len = 16+16x, npatterns = 1, nelts_per_pattern = 3 sel = { len, 0, 2, ... } npatterns = 1, nelts_per_pattern = 3 a1 = 0 esel = (16 + 16x) / 1 = 16 + 16x S = 2 ae = a1 + (esel - 2) * S = 28 + 32x r1 = 0 % (32 + 32x) = 0 re = (28 + 32x) % (32 + 32x) = 28 + 32x However r1 is in interval [0, 16+16x) and re is in interval [16+16x, 32+32x), so we cross input vectors here and return NULL_TREE. Thanks, Prathamesh > > Richard > > > + > > tree *in_elts = XALLOCAVEC (tree, nelts * 2); > > if (!vec_cst_ctor_to_array (arg0, nelts, in_elts) > > || !vec_cst_ctor_to_array (arg1, nelts, in_elts + nelts)) > > return NULL_TREE; > > > > - tree_vector_builder out_elts (type, nelts, 1); > > + vec *v; > > + vec_alloc (v, nelts); > > for (i = 0; i < nelts; i++) > > { > > HOST_WIDE_INT index; > > if (!sel[i].is_constant (&index)) > > return NULL_TREE; > > - if (!CONSTANT_CLASS_P (in_elts[index])) > > - need_ctor = true; > > - out_elts.quick_push (unshare_expr (in_elts[index])); > > + CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, in_elts[index]); > > } > > - > > - if (need_ctor) > > - { > > - vec *v; > > - vec_alloc (v, nelts); > > - for (i = 0; i < nelts; i++) > > - CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, out_elts[i]); > > - return build_constructor (type, v); > > - } > > - else > > - return out_elts.build (); > > + return build_constructor (type, v); > > } > > > > /* Try to fold a pointer difference of type TYPE two address expressions of > > @@ -16891,6 +17109,388 @@ test_arithmetic_folding () > > x); > > } > > > > +namespace test_fold_vec_perm_cst { > > + > > +static tree > > +get_preferred_vectype (tree inner_type) > > +{ > > + scalar_int_mode int_mode = SCALAR_INT_TYPE_MODE (inner_type); > > + machine_mode vmode = targetm.vectorize.preferred_simd_mode (int_mode); > > + poly_uint64 nunits = GET_MODE_NUNITS (vmode); > > + return build_vector_type (inner_type, nunits); > > +} > > + > > +static tree > > +build_vec_cst_rand (tree inner_type, unsigned npatterns, > > + unsigned nelts_per_pattern, int S = 0) > > +{ > > + tree vectype = get_preferred_vectype (inner_type); > > + tree_vector_builder builder (vectype, npatterns, nelts_per_pattern); > > + > > + // Fill a0 for each pattern > > + for (unsigned i = 0; i < npatterns; i++) > > + builder.quick_push (build_int_cst (inner_type, rand () % 100)); > > + > > + if (nelts_per_pattern == 1) > > + return builder.build (); > > + > > + // Fill a1 for each pattern > > + for (unsigned i = 0; i < npatterns; i++) > > + builder.quick_push (build_int_cst (inner_type, rand () % 100)); > > + > > + if (nelts_per_pattern == 2) > > + return builder.build (); > > + > > + for (unsigned i = npatterns * 2; i < npatterns * nelts_per_pattern; i++) > > + { > > + tree prev_elem = builder[i - npatterns]; > > + int prev_elem_val = TREE_INT_CST_LOW (prev_elem); > > + int val = prev_elem_val + S; > > + builder.quick_push (build_int_cst (inner_type, val)); > > + } > > + > > + return builder.build (); > > +} > > + > > +static void > > +validate_res (unsigned npatterns, unsigned nelts_per_pattern, > > + tree res, tree *expected_res) > > +{ > > + ASSERT_TRUE (VECTOR_CST_NPATTERNS (res) == npatterns); > > + ASSERT_TRUE (VECTOR_CST_NELTS_PER_PATTERN (res) == nelts_per_pattern); > > + > > + for (unsigned i = 0; i < vector_cst_encoded_nelts (res); i++) > > + ASSERT_TRUE (operand_equal_p (VECTOR_CST_ELT (res, i), expected_res[i], 0)); > > +} > > + > > +/* Verify VLA vec_perm folding. */ > > + > > +static void > > +test_stepped () > > +{ > > + /* Case 1: sel = {0, 1, 2, ...} > > + npatterns = 1, nelts_per_pattern = 3 */ > > + { > > + tree arg0 = build_vec_cst_rand (char_type_node, 1, 3, 2); > > + tree arg1 = build_vec_cst_rand (char_type_node, 1, 3, 2); > > + poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > + > > + vec_perm_builder builder (arg0_len, 1, 3); > > + builder.quick_push (0); > > + builder.quick_push (1); > > + builder.quick_push (2); > > + > > + vec_perm_indices sel (builder, 2, arg0_len); > > + tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel); > > + tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg0, 1), > > + vector_cst_elt (arg0, 2) }; > > + validate_res (1, 3, res, expected_res); > > + } > > + > > +#if 0 > > + /* Case 2: sel = {len, len + 1, len + 2, ... } > > + npatterns = 1, nelts_per_pattern = 3 > > + FIXME: This should return > > + expected res: { op1[0], op1[1], op1[2], ... } > > + however it returns NULL_TREE. */ > > + { > > + vec_perm_builder builder (arg0_len, 1, 3); > > + builder.quick_push (arg0_len); > > + builder.quick_push (arg0_len + 1); > > + builder.quick_push (arg0_len + 2); > > + > > + vec_perm_indices sel (builder, 2, arg0_len); > > + tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel); > > + } > > +#endif > > + > > + /* Case 3: Leading element of arg1, stepped sequence: pattern 0 of arg0. > > + sel = {len, 0, 0, 0, 2, 0, ...} > > + npatterns = 2, nelts_per_pattern = 3. > > + Use extra pattern {0, ...} to lower number of elements per pattern. */ > > + { > > + tree arg0 = build_vec_cst_rand (char_type_node, 1, 3, 2); > > + tree arg1 = build_vec_cst_rand (char_type_node, 1, 3, 2); > > + poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > + > > + vec_perm_builder builder (arg0_len, 2, 3); > > + builder.quick_push (arg0_len); > > + int mask_elems[] = { 0, 0, 0, 2, 0 }; > > + for (int i = 0; i < 5; i++) > > + builder.quick_push (mask_elems[i]); > > + > > + vec_perm_indices sel (builder, 2, arg0_len); > > + tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel); > > + gcc_assert (res); > > + } > > + > > + /* Case 4: > > + sel = { len, 0, 2, ... } npatterns = 1, nelts_per_pattern = 3. > > + This should return NULL because we cross the input vectors. > > + Because, > > + arg0_len = 16 + 16x > > + a1 = 0 > > + S = 2 > > + esel = arg0_len / npatterns_sel = 16+16x/1 = 16 + 16x > > + ae = a1 + (esel - 2) * S > > + = 0 + (16 + 16x - 2) * 2 > > + = 28 + 32x > > + a1 / arg0_len = 0 /trunc (16 + 16x) = 0 > > + ae / arg0_len = (28 + 32x) /trunc (16 + 16x), which is not defined, > > + since 28/16 != 32/16. > > + So return NULL_TREE. */ > > + { > > + tree arg0 = build_vec_cst_rand (char_type_node, 1, 3, 2); > > + tree arg1 = build_vec_cst_rand (char_type_node, 1, 3, 2); > > + poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > + > > + poly_uint64 ae = (arg0_len - 2) * 2; > > + uint64_t qe; > > + poly_uint64 re; > > + > > + vec_perm_builder builder (arg0_len, 1, 3); > > + builder.quick_push (arg0_len); > > + builder.quick_push (0); > > + builder.quick_push (2); > > + > > + vec_perm_indices sel (builder, 2, arg0_len); > > + char reason[100] = "\0"; > > + tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel, reason, false); > > + gcc_assert (res == NULL_TREE); > > + gcc_assert (!strcmp (reason, "crossed input vectors")); > > + } > > + > > + /* Case 5: Select elements from different patterns. > > + Should return NULL. */ > > + { > > + tree op0 = build_vec_cst_rand (char_type_node, 2, 3, 2); > > + tree op1 = build_vec_cst_rand (char_type_node, 2, 3, 2); > > + poly_uint64 op0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0)); > > + > > + vec_perm_builder builder (op0_len, 2, 3); > > + builder.quick_push (op0_len); > > + int mask_elems[] = { 0, 0, 0, 1, 0 }; > > + for (int i = 0; i < 5; i++) > > + builder.quick_push (mask_elems[i]); > > + > > + vec_perm_indices sel (builder, 2, op0_len); > > + char reason[100] = "\0"; > > + tree res = fold_vec_perm_cst (TREE_TYPE (op0), op0, op1, sel, reason, false); > > + gcc_assert (res == NULL_TREE); > > + gcc_assert (!strcmp (reason, "S is not multiple of npatterns")); > > + } > > + > > + /* Case 6: Select pattern 0 of op0 and dup of op0[0] > > + op0, op1, sel: npatterns = 2, nelts_per_pattern = 3 > > + sel = { 0, 0, 2, 0, 4, 0, ... }. > > + > > + For pattern {0, 2, 4, ...}: > > + a1 = 2 > > + len = 16 + 16x > > + S = 2 > > + esel = len / npatterns_sel = (16 + 16x) / 2 = (8 + 8x) > > + ae = a1 + (esel - 2) * S > > + = 2 + (8 + 8x - 2) * 2 > > + = 14 + 16x > > + a1 / arg0_len = 2 / (16 + 16x) = 0 > > + ae / arg0_len = (14 + 16x) / (16 + 16x) = 0 > > + So a1/arg0_len = ae/arg0_len = 0 > > + Hence we select from first vector op0 > > + S = 2, npatterns = 2. > > + Since S is multiple of npatterns(op0), we are selecting from > > + same pattern of op0. > > + > > + For pattern {0, ...}, we are choosing { op0[0] ... } > > + So res will be combination of above patterns: > > + res: { op0[0], op0[0], op0[2], op0[0], op0[4], op0[0], ... } > > + with npatterns = 2, nelts_per_pattern = 3. */ > > + { > > + tree op0 = build_vec_cst_rand (char_type_node, 2, 3, 2); > > + tree op1 = build_vec_cst_rand (char_type_node, 2, 3, 2); > > + poly_uint64 op0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0)); > > + > > + vec_perm_builder builder (op0_len, 2, 3); > > + int mask_elems[] = { 0, 0, 2, 0, 4, 0 }; > > + for (int i = 0; i < 6; i++) > > + builder.quick_push (mask_elems[i]); > > + > > + vec_perm_indices sel (builder, 2, op0_len); > > + tree res = fold_vec_perm_cst (TREE_TYPE (op0), op0, op1, sel); > > + tree expected_res[] = { vector_cst_elt (op0, 0), vector_cst_elt (op0, 0), > > + vector_cst_elt (op0, 2), vector_cst_elt (op0, 0), > > + vector_cst_elt (op0, 4), vector_cst_elt (op0, 0) }; > > + validate_res (2, 3, res, expected_res); > > + } > > + > > + /* Case 7: sel_npatterns > op_npatterns; > > + op0, op1: npatterns = 2, nelts_per_pattern = 3 > > + sel: { 0, 0, 1, len, 2, 0, 3, len, 4, 0, 5, len, ...}, > > + with npatterns = 4, nelts_per_pattern = 3. */ > > + { > > + tree op0 = build_vec_cst_rand (char_type_node, 2, 3, 2); > > + tree op1 = build_vec_cst_rand (char_type_node, 2, 3, 2); > > + poly_uint64 op0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0)); > > + > > + vec_perm_builder builder(op0_len, 4, 3); > > + // -1 is used as place holder for poly_int_cst > > + int mask_elems[] = { 0, 0, 1, -1, 2, 0, 3, -1, 4, 0, 5, -1 }; > > + for (int i = 0; i < 12; i++) > > + builder.quick_push ((mask_elems[i] == -1) ? op0_len : mask_elems[i]); > > + > > + vec_perm_indices sel (builder, 2, op0_len); > > + tree res = fold_vec_perm_cst (TREE_TYPE (op0), op0, op1, sel); > > + tree expected_res[] = { vector_cst_elt (op0, 0), vector_cst_elt (op0, 0), > > + vector_cst_elt (op0, 1), vector_cst_elt (op1, 0), > > + vector_cst_elt (op0, 2), vector_cst_elt (op0, 0), > > + vector_cst_elt (op0, 3), vector_cst_elt (op1, 0), > > + vector_cst_elt (op0, 4), vector_cst_elt (op0, 0), > > + vector_cst_elt (op0, 5), vector_cst_elt (op1, 0) }; > > + validate_res (4, 3, res, expected_res); > > + } > > +} > > + > > +static void > > +test_dup () > > +{ > > + /* Case 1: mask = {0, ...} */ > > + { > > + tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1); > > + tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1); > > + poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > + > > + vec_perm_builder builder (len, 1, 1); > > + builder.quick_push (0); > > + vec_perm_indices sel (builder, 2, len); > > + tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel); > > + > > + tree expected_res[] = { vector_cst_elt (res, 0) }; > > + validate_res (1, 1, res, expected_res); > > + } > > + > > + /* Case 2: mask = {len, ...} */ > > + { > > + tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1); > > + tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1); > > + poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > + > > + vec_perm_builder builder (len, 1, 1); > > + builder.quick_push (len); > > + vec_perm_indices sel (builder, 2, len); > > + tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel); > > + > > + tree expected_res[] = { vector_cst_elt (arg1, 0) }; > > + validate_res (1, 1, res, expected_res); > > + } > > + > > + /* Case 3: mask = { 0, len, ... } */ > > + { > > + tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1); > > + tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1); > > + poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > + > > + vec_perm_builder builder (len, 2, 1); > > + builder.quick_push (0); > > + builder.quick_push (len); > > + vec_perm_indices sel (builder, 2, len); > > + tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel); > > + > > + tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg1, 0) }; > > + validate_res (2, 1, res, expected_res); > > + } > > + > > + /* Case 4: mask = { 0, len, 1, len+1, ... } */ > > + { > > + tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1); > > + tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1); > > + poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > + > > + vec_perm_builder builder (len, 2, 2); > > + builder.quick_push (0); > > + builder.quick_push (len); > > + builder.quick_push (1); > > + builder.quick_push (len + 1); > > + vec_perm_indices sel (builder, 2, len); > > + tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel); > > + > > + tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg1, 0), > > + vector_cst_elt (arg0, 1), vector_cst_elt (arg1, 1) > > + }; > > + validate_res (2, 2, res, expected_res); > > + } > > + > > + /* Case 5: mask = { 0, len, 1, len+1, .... } > > + npatterns = 4, nelts_per_pattern = 1 */ > > + { > > + tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1); > > + tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1); > > + poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > + > > + vec_perm_builder builder (len, 4, 1); > > + builder.quick_push (0); > > + builder.quick_push (len); > > + builder.quick_push (1); > > + builder.quick_push (len + 1); > > + vec_perm_indices sel (builder, 2, len); > > + tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel); > > + > > + tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg1, 0), > > + vector_cst_elt (arg0, 1), vector_cst_elt (arg1, 1) > > + }; > > + validate_res (4, 1, res, expected_res); > > + } > > + > > + /* Case 6: mask = {0, 4, ...} > > + npatterns = 1, nelts_per_pattern = 2. > > + This should return NULL_TREE because the index 4 may choose > > + from either arg0 or arg1 depending on vector length. */ > > + { > > + tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1); > > + tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1); > > + poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > + > > + vec_perm_builder builder (len, 1, 2); > > + builder.quick_push (0); > > + builder.quick_push (4); > > + vec_perm_indices sel (builder, 2, len); > > + tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel); > > + ASSERT_TRUE (res == NULL_TREE); > > + } > > + > > + /* Case 7: npatterns(arg0) = 4 > npatterns(sel) = 2 > > + mask = {0, len, 1, len + 1, ...} > > + sel_npatterns = 2, sel_nelts_per_pattern = 2. */ > > + { > > + tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1); > > + tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1); > > + poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)); > > + > > + vec_perm_builder builder (arg0_len, 2, 2); > > + builder.quick_push (0); > > + builder.quick_push (arg0_len); > > + builder.quick_push (1); > > + builder.quick_push (arg0_len + 1); > > + vec_perm_indices sel (builder, 2, arg0_len); > > + tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel); > > + > > + tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg1, 0), > > + vector_cst_elt (arg0, 1), vector_cst_elt (arg1, 1) > > + }; > > + validate_res (2, 2, res, expected_res); > > + } > > +} > > + > > +static void > > +test () > > +{ > > + tree vectype = get_preferred_vectype (integer_type_node); > > + if (TYPE_VECTOR_SUBPARTS (vectype).is_constant ()) > > + return; > > + > > + test_dup (); > > + test_stepped (); > > +} > > +}; > > + > > /* Verify that various binary operations on vectors are folded > > correctly. */ > > > > @@ -16942,6 +17542,7 @@ fold_const_cc_tests () > > test_arithmetic_folding (); > > test_vector_folding (); > > test_vec_duplicate_folding (); > > + test_fold_vec_perm_cst::test (); > > } > > > > } // namespace selftest