From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <SRS0=0V4n=DV=arm.com=richard.sandiford@sourceware.org>
Received: from foss.arm.com (foss.arm.com [217.140.110.172])
	by sourceware.org (Postfix) with ESMTP id 4571E3857709
	for <gcc-patches@gcc.gnu.org>; Fri,  4 Aug 2023 15:06:39 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 4571E3857709
Authentication-Results: sourceware.org; dmarc=pass (p=none dis=none) header.from=arm.com
Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=arm.com
Received: from usa-sjc-imap-foss1.foss.arm.com (unknown [10.121.207.14])
	by usa-sjc-mx-foss1.foss.arm.com (Postfix) with ESMTP id 877591007;
	Fri,  4 Aug 2023 08:07:21 -0700 (PDT)
Received: from localhost (e121540-lin.manchester.arm.com [10.32.110.72])
	by usa-sjc-imap-foss1.foss.arm.com (Postfix) with ESMTPSA id 441CC3F6C4;
	Fri,  4 Aug 2023 08:06:38 -0700 (PDT)
From: Richard Sandiford <richard.sandiford@arm.com>
To: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>
Mail-Followup-To: Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org>,gcc Patches <gcc-patches@gcc.gnu.org>, richard.sandiford@arm.com
Cc: gcc Patches <gcc-patches@gcc.gnu.org>
Subject: Re: [RFC] [v2] Extend fold_vec_perm to handle VLA vectors
References: <CAAgBjMnk_N=tgPNBhUu91yt8YN0HcCoWgQQYpshHMqhU=6WgAQ@mail.gmail.com>
	<mptmszkdubp.fsf@arm.com>
	<CAAgBjMmOrVxsNT8jK0Ei-RYrhtNVymd7pmaud2tA6=OAJCbpOg@mail.gmail.com>
	<mpt4jlg2sb3.fsf@arm.com> <mptv8dw1d2l.fsf@arm.com>
	<CAAgBjMmuzg54RiUTXVQE06+AZxCM-4w+ohojFND5TWghfsiV8w@mail.gmail.com>
Date: Fri, 04 Aug 2023 16:06:37 +0100
In-Reply-To: <CAAgBjMmuzg54RiUTXVQE06+AZxCM-4w+ohojFND5TWghfsiV8w@mail.gmail.com>
	(Prathamesh Kulkarni's message of "Fri, 4 Aug 2023 15:36:22 +0530")
Message-ID: <mpto7jmyhhe.fsf@arm.com>
User-Agent: Gnus/5.13 (Gnus v5.13) Emacs/26.3 (gnu/linux)
MIME-Version: 1.0
Content-Type: text/plain
X-Spam-Status: No, score=-26.1 required=5.0 tests=BAYES_00,GIT_PATCH_0,KAM_DMARC_NONE,KAM_DMARC_STATUS,KAM_LAZY_DOMAIN_SECURITY,SPF_HELO_NONE,SPF_NONE,TXREP autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org
List-Id: <gcc-patches.gcc.gnu.org>

Full review this time, sorry for the skipping the tests earlier.

Prathamesh Kulkarni <prathamesh.kulkarni@linaro.org> writes:
> diff --git a/gcc/fold-const.cc b/gcc/fold-const.cc
> index 7e5494dfd39..680d0e54fd4 100644
> --- a/gcc/fold-const.cc
> +++ b/gcc/fold-const.cc
> @@ -85,6 +85,10 @@ along with GCC; see the file COPYING3.  If not see
>  #include "vec-perm-indices.h"
>  #include "asan.h"
>  #include "gimple-range.h"
> +#include <algorithm>

This should be included by defining INCLUDE_ALGORITHM instead.

> +#include "tree-pretty-print.h"
> +#include "gimple-pretty-print.h"
> +#include "print-tree.h"

Are these still needed, or were they for debugging?

>  
>  /* Nonzero if we are folding constants inside an initializer or a C++
>     manifestly-constant-evaluated context; zero otherwise.
> @@ -10494,15 +10498,9 @@ fold_mult_zconjz (location_t loc, tree type, tree expr)
>  static bool
>  vec_cst_ctor_to_array (tree arg, unsigned int nelts, tree *elts)
>  {
> -  unsigned HOST_WIDE_INT i, nunits;
> +  unsigned HOST_WIDE_INT i;
>  
> -  if (TREE_CODE (arg) == VECTOR_CST
> -      && VECTOR_CST_NELTS (arg).is_constant (&nunits))
> -    {
> -      for (i = 0; i < nunits; ++i)
> -	elts[i] = VECTOR_CST_ELT (arg, i);
> -    }
> -  else if (TREE_CODE (arg) == CONSTRUCTOR)
> +  if (TREE_CODE (arg) == CONSTRUCTOR)
>      {
>        constructor_elt *elt;
>  
> @@ -10520,6 +10518,192 @@ vec_cst_ctor_to_array (tree arg, unsigned int nelts, tree *elts)
>    return true;
>  }
>  
> +/* Helper routine for fold_vec_perm_cst to check if SEL is a suitable
> +   mask for VLA vec_perm folding.
> +   REASON if specified, will contain the reason why SEL is not suitable.
> +   Used only for debugging and unit-testing.
> +   VERBOSE if enabled is used for debugging output.  */
> +
> +static bool
> +valid_mask_for_fold_vec_perm_cst_p (tree arg0, tree arg1,
> +				    const vec_perm_indices &sel,
> +				    const char **reason = NULL,
> +				    ATTRIBUTE_UNUSED bool verbose = false)

Since verbose is no longer needed (good!), I think we should just remove it.

> +{
> +  unsigned sel_npatterns = sel.encoding ().npatterns ();
> +  unsigned sel_nelts_per_pattern = sel.encoding ().nelts_per_pattern ();
> +
> +  if (!(pow2p_hwi (sel_npatterns)
> +	&& pow2p_hwi (VECTOR_CST_NPATTERNS (arg0))
> +	&& pow2p_hwi (VECTOR_CST_NPATTERNS (arg1))))
> +    {
> +      if (reason)
> +	*reason = "npatterns is not power of 2";
> +      return false;
> +    }
> +
> +  /* We want to avoid cases where sel.length is not a multiple of npatterns.
> +     For eg: sel.length = 2 + 2x, and sel npatterns = 4.  */
> +  poly_uint64 esel;
> +  if (!multiple_p (sel.length (), sel_npatterns, &esel))
> +    {
> +      if (reason)
> +	*reason = "sel.length is not multiple of sel_npatterns";
> +      return false;
> +    }
> +
> +  if (sel_nelts_per_pattern < 3)
> +    return true;
> +
> +  for (unsigned pattern = 0; pattern < sel_npatterns; pattern++)
> +    {
> +      poly_uint64 a1 = sel[pattern + sel_npatterns];
> +      poly_uint64 a2 = sel[pattern + 2 * sel_npatterns];
> +      HOST_WIDE_INT S; 

Trailing whitespace.  The convention is to use lowercase variable
names, so please call this "step".

> +      if (!poly_int64 (a2 - a1).is_constant (&S))
> +	{
> +	  if (reason)
> +	    *reason = "step is not constant";
> +	  return false;
> +	}
> +      // FIXME: Punt on S < 0 for now, revisit later.
> +      if (S < 0)
> +	return false;
> +      if (S == 0)
> +	continue;
> +
> +      if (!pow2p_hwi (S))
> +	{
> +	  if (reason)
> +	    *reason = "step is not power of 2";
> +	  return false;
> +	}
> +
> +      /* Ensure that stepped sequence of the pattern selects elements
> +	 only from the same input vector if it's VLA.  */

s/ if it's VLA//

> +      uint64_t q1, qe;
> +      poly_uint64 r1, re;
> +      poly_uint64 ae = a1 + (esel - 2) * S;
> +      poly_uint64 arg_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +
> +      if (!(can_div_trunc_p (a1, arg_len, &q1, &r1)
> +	    && can_div_trunc_p (ae, arg_len, &qe, &re)
> +	    && q1 == qe))
> +	{
> +	  if (reason)
> +	    *reason = "crossed input vectors";
> +	  return false;
> +	}
> +

Probably worth a comment above the following code too:

  /* Ensure that the stepped sequence always selects from the same
     input pattern.  */

> +      unsigned arg_npatterns
> +	= ((q1 & 0) == 0) ? VECTOR_CST_NPATTERNS (arg0)
> +			  : VECTOR_CST_NPATTERNS (arg1);
> +
> +      if (!multiple_p (S, arg_npatterns))
> +	{
> +	  if (reason)
> +	    *reason = "S is not multiple of npatterns";
> +	  return false;
> +	}
> +    }
> +
> +  return true;
> +}
> +
> +/* Try to fold permutation of ARG0 and ARG1 with SEL selector when
> +   the input vectors are VECTOR_CST. Return NULL_TREE otherwise.
> +   REASON and VERBOSE have same purpose as described in
> +   valid_mask_for_fold_vec_perm_cst_p.
> +
> +   (1) If SEL is a suitable mask as determined by
> +       valid_mask_for_fold_vec_perm_cst_p, then:
> +       res_npatterns = max of npatterns between ARG0, ARG1, and SEL
> +       res_nelts_per_pattern = max of nelts_per_pattern between
> +			       ARG0, ARG1 and SEL.
> +   (2) If SEL is not a suitable mask, and ARG0, ARG1 are VLS,
> +       then:
> +       res_npatterns = nelts in input vector.

s/input vector/result vector/

> +       res_nelts_per_pattern = 1.
> +       This exception is made so that VLS ARG0, ARG1 and SEL work as before.  */

Guess this is personal preference, but (1) and (2) seem more like
implementation details, so I think they belong...

> +
> +static tree
> +fold_vec_perm_cst (tree type, tree arg0, tree arg1, const vec_perm_indices &sel,
> +		   const char **reason = NULL, bool verbose = false)
> +{
> +  unsigned res_npatterns, res_nelts_per_pattern;
> +  unsigned HOST_WIDE_INT res_nelts;
> +

...here instead.

> +  if (valid_mask_for_fold_vec_perm_cst_p (arg0, arg1, sel, reason, verbose))
> +    {
> +      res_npatterns
> +	= std::max (VECTOR_CST_NPATTERNS (arg0),
> +		    std::max (VECTOR_CST_NPATTERNS (arg1),
> +			      sel.encoding ().npatterns ()));
> +
> +      res_nelts_per_pattern
> +	= std::max (VECTOR_CST_NELTS_PER_PATTERN (arg0),
> +		    std::max (VECTOR_CST_NELTS_PER_PATTERN (arg1),
> +			      sel.encoding ().nelts_per_pattern ()));
> +
> +      res_nelts = res_npatterns * res_nelts_per_pattern;
> +    }
> +  else if (TYPE_VECTOR_SUBPARTS (type).is_constant (&res_nelts))
> +    {
> +      res_npatterns = res_nelts;
> +      res_nelts_per_pattern = 1;
> +    }
> +  else
> +    return NULL_TREE;
> +
> +  tree_vector_builder out_elts (type, res_npatterns, res_nelts_per_pattern);
> +  for (unsigned i = 0; i < res_nelts; i++)
> +    {
> +      poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +      uint64_t q;
> +      poly_uint64 r;
> +      unsigned HOST_WIDE_INT index;
> +
> +      unsigned HOST_WIDE_INT arg_nelts;
> +      if (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)).is_constant (&arg_nelts)
> +	  && known_ge (sel[i], poly_int64 (2 * arg_nelts)))
> +	{
> +	  if (reason)
> +	    *reason = "out of bounds access";
> +	  return NULL_TREE;
> +	}

I don't think this is needed.  The selector indices wrap, and the code
below should handle the wrapping correctly.

> +
> +      /* Punt if sel[i] /trunc_div len cannot be determined,
> +	 because the input vector to be chosen will depend on
> +	 runtime vector length.
> +	 For example if len == 4 + 4x, and sel[i] == 4,
> +	 If len at runtime equals 4, we choose arg1[0].
> +	 For any other value of len > 4 at runtime, we choose arg0[4].
> +	 which makes the element choice dependent on runtime vector length.  */
> +      if (!can_div_trunc_p (sel[i], len, &q, &r))
> +	{
> +	  if (reason)
> +	    *reason = "cannot divide selector element by arg len";
> +	  return NULL_TREE;
> +	}
> +
> +      /* sel[i] % len will give the index of element in the chosen input
> +	 vector. For example if sel[i] == 5 + 4x and len == 4 + 4x,
> +	 we will choose arg1[1] since (5 + 4x) % (4 + 4x) == 1.  */
> +      if (!r.is_constant (&index))
> +	{
> +	  if (reason)
> +	    *reason = "remainder is not constant";
> +	  return NULL_TREE;
> +	}
> +
> +      tree arg = ((q & 1) == 0) ? arg0 : arg1;
> +      tree elem = vector_cst_elt (arg, index);
> +      out_elts.quick_push (elem);
> +    }
> +
> +  return out_elts.build ();
> +}
> +
>  /* Attempt to fold vector permutation of ARG0 and ARG1 vectors using SEL
>     selector.  Return the folded VECTOR_CST or CONSTRUCTOR if successful,
>     NULL_TREE otherwise.  */
> @@ -10529,43 +10713,40 @@ fold_vec_perm (tree type, tree arg0, tree arg1, const vec_perm_indices &sel)
>  {
>    unsigned int i;
>    unsigned HOST_WIDE_INT nelts;
> -  bool need_ctor = false;
>  
> -  if (!sel.length ().is_constant (&nelts))
> -    return NULL_TREE;
> -  gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (type), nelts)
> -	      && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)), nelts)
> -	      && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1)), nelts));
> +  gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (type), sel.length ())
> +	      && known_eq (TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0)),
> +			   TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg1))));
> +
>    if (TREE_TYPE (TREE_TYPE (arg0)) != TREE_TYPE (type)
>        || TREE_TYPE (TREE_TYPE (arg1)) != TREE_TYPE (type))
>      return NULL_TREE;
>  
> +  if (TREE_CODE (arg0) == VECTOR_CST
> +      && TREE_CODE (arg1) == VECTOR_CST)
> +    return fold_vec_perm_cst (type, arg0, arg1, sel);
> +
> +  /* For fall back case, we want to ensure we have VLS vectors
> +     with equal length.  */
> +  if (!sel.length ().is_constant (&nelts))
> +    return NULL_TREE;
> +
> +  gcc_assert (known_eq (sel.length (), TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0))));

Nit: long line.

>    tree *in_elts = XALLOCAVEC (tree, nelts * 2);
>    if (!vec_cst_ctor_to_array (arg0, nelts, in_elts)
>        || !vec_cst_ctor_to_array (arg1, nelts, in_elts + nelts))
>      return NULL_TREE;
>  
> -  tree_vector_builder out_elts (type, nelts, 1);
> +  vec<constructor_elt, va_gc> *v;
> +  vec_alloc (v, nelts);
>    for (i = 0; i < nelts; i++)
>      {
>        HOST_WIDE_INT index;
>        if (!sel[i].is_constant (&index))
>  	return NULL_TREE;
> -      if (!CONSTANT_CLASS_P (in_elts[index]))
> -	need_ctor = true;
> -      out_elts.quick_push (unshare_expr (in_elts[index]));
> -    }
> -
> -  if (need_ctor)
> -    {
> -      vec<constructor_elt, va_gc> *v;
> -      vec_alloc (v, nelts);
> -      for (i = 0; i < nelts; i++)
> -	CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, out_elts[i]);
> -      return build_constructor (type, v);
> +      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, in_elts[index]);
>      }
> -  else
> -    return out_elts.build ();
> +  return build_constructor (type, v);
>  }
>  
>  /* Try to fold a pointer difference of type TYPE two address expressions of
> @@ -16892,6 +17073,508 @@ test_arithmetic_folding ()
>  				   x);
>  }
>  
> +namespace test_fold_vec_perm_cst {
> +
> +static tree
> +get_preferred_vectype (tree inner_type)
> +{
> +  scalar_int_mode int_mode = SCALAR_INT_TYPE_MODE (inner_type);
> +  machine_mode vmode = targetm.vectorize.preferred_simd_mode (int_mode);
> +  poly_uint64 nunits = GET_MODE_NUNITS (vmode);
> +  return build_vector_type (inner_type, nunits);
> +}
> +
> +static tree
> +build_vec_cst_rand (tree inner_type, unsigned npatterns,
> +		    unsigned nelts_per_pattern, int S = 0,

Similar comment about lowercase variable names here.

> +		    tree vectype = NULL_TREE)
> +{
> +  if (!vectype)
> +    vectype = get_preferred_vectype (inner_type);

I'm not sure how portable this is.  It looks like the tests rely on
the integer_type_node vectors being 4 + 4x, but that isn't necessarily
true on all VLA targets.

Perhaps instead the tests could be classified based on the vector
lengths that they assume.  Then we can iterate through the vector
modes and call the appropriate function based on GET_MODE_NUNITS
and GET_MODE_INNER.

> +  tree_vector_builder builder (vectype, npatterns, nelts_per_pattern);
> +
> +  // Fill a0 for each pattern
> +  for (unsigned i = 0; i < npatterns; i++)
> +    builder.quick_push (build_int_cst (inner_type, rand () % 100));
> +
> +  if (nelts_per_pattern == 1)
> +    return builder.build ();
> +
> +  // Fill a1 for each pattern
> +  for (unsigned i = 0; i < npatterns; i++)
> +    builder.quick_push (build_int_cst (inner_type, rand () % 100));
> +
> +  if (nelts_per_pattern == 2)
> +    return builder.build ();
> +
> +  for (unsigned i = npatterns * 2; i < npatterns * nelts_per_pattern; i++)
> +    {
> +      tree prev_elem = builder[i - npatterns];
> +      int prev_elem_val = TREE_INT_CST_LOW (prev_elem);
> +      int val = prev_elem_val + S;
> +      builder.quick_push (build_int_cst (inner_type, val));
> +    }
> +
> +  return builder.build ();
> +}
> +
> +static void
> +validate_res (unsigned npatterns, unsigned nelts_per_pattern,
> +	      tree res, tree *expected_res)
> +{
> +  ASSERT_TRUE (VECTOR_CST_NPATTERNS (res) == npatterns);
> +  ASSERT_TRUE (VECTOR_CST_NELTS_PER_PATTERN (res) == nelts_per_pattern);

I don't think this is safe when the inputs are randomised.  E.g. we
could by chance end up with a vector of all zeros, which would have
a single pattern and a single element per pattern, regardless of the
shapes of the inputs.

Given the way that vector_builder<T, Shape, Derived>::finalize
canonicalises the encoding, it should be safe to use:

* VECTOR_CST_NPATTERNS (res) <= npatterns
* vector_cst_encoded_nelts (res) <= npatterns * nelts_per_pattern

If we do that then...

> +
> +  for (unsigned i = 0; i < vector_cst_encoded_nelts (res); i++)

...this loop bound should be npatterns * nelts_per_pattern instead.

> +    ASSERT_TRUE (operand_equal_p (VECTOR_CST_ELT (res, i), expected_res[i], 0));
> +}
> +
> +static void
> +validate_res_vls (tree res, tree *expected_res, unsigned expected_nelts)
> +{
> +  ASSERT_TRUE (known_eq (VECTOR_CST_NELTS (res), expected_nelts));
> +  for (unsigned i = 0; i < expected_nelts; i++)
> +    ASSERT_TRUE (operand_equal_p (VECTOR_CST_ELT (res, i), expected_res[i], 0));
> +}
> +
> +/* Verify VLA vec_perm folding.  */
> +
> +static void
> +test_stepped ()
> +{
> +  /* Case 1: sel = {0, 1, 2, ...}
> +     npatterns = 1, nelts_per_pattern = 3
> +     expected res: { arg0[0], arg0[1], arg0[2], ... } */
> +  {
> +    tree arg0 = build_vec_cst_rand (integer_type_node, 1, 3, 2);
> +    tree arg1 = build_vec_cst_rand (integer_type_node, 1, 3, 2);
> +    poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +
> +    vec_perm_builder builder (arg0_len, 1, 3);
> +    builder.quick_push (0);
> +    builder.quick_push (1);
> +    builder.quick_push (2);
> +
> +    vec_perm_indices sel (builder, 2, arg0_len);
> +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> +    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg0, 1),
> +			    vector_cst_elt (arg0, 2) };
> +    validate_res (1, 3, res, expected_res);
> +  }
> +
> +  /* Case 2: sel = {len, len + 1, len + 2, ... }
> +     npatterns = 1, nelts_per_pattern = 3
> +     FIXME: This should return
> +     expected res: { op1[0], op1[1], op1[2], ... }
> +     however it returns NULL_TREE.  */

Looks like the comment is out of date.

> +  {
> +    tree arg0 = build_vec_cst_rand (integer_type_node, 1, 3, 2);
> +    tree arg1 = build_vec_cst_rand (integer_type_node, 1, 3, 2);
> +    poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +    
> +    vec_perm_builder builder (arg0_len, 1, 3);
> +    builder.quick_push (arg0_len);
> +    builder.quick_push (arg0_len + 1);
> +    builder.quick_push (arg0_len + 2);
> +
> +    vec_perm_indices sel (builder, 2, arg0_len);
> +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel, NULL, true);
> +    tree expected_res[] = { vector_cst_elt (arg1, 0), vector_cst_elt (arg1, 1),
> +			    vector_cst_elt (arg1, 2) };
> +    validate_res (1, 3, res, expected_res);
> +  }
> +
> +  /* Case 3: Leading element of arg1, stepped sequence: pattern 0 of arg0.
> +     sel = {len, 0, 0, 0, 2, 0, ...}
> +     npatterns = 2, nelts_per_pattern = 3.
> +     Use extra pattern {0, ...} to lower number of elements per pattern.  */
> +  {
> +    tree arg0 = build_vec_cst_rand (char_type_node, 1, 3, 2);
> +    tree arg1 = build_vec_cst_rand (char_type_node, 1, 3, 2);
> +    poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +
> +    vec_perm_builder builder (arg0_len, 2, 3);
> +    builder.quick_push (arg0_len);
> +    int mask_elems[] = { 0, 0, 0, 2, 0 };
> +    for (int i = 0; i < 5; i++)
> +      builder.quick_push (mask_elems[i]);

This leaves one of the elements unspecified.

> +
> +    vec_perm_indices sel (builder, 2, arg0_len);
> +    const char *reason; 
> +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel, &reason);
> +
> +    tree expected_res[] = { vector_cst_elt (arg1, 0), vector_cst_elt (arg0, 0),
> +			    vector_cst_elt (arg0, 0), vector_cst_elt (arg0, 0),
> +			    vector_cst_elt (arg0, 2), vector_cst_elt (arg0, 0)
> +			  };
> +    validate_res (2, 3, res, expected_res);
> +  }
> +
> +  /* Case 4:
> +     sel = { len, 0, 2, ... } npatterns = 1, nelts_per_pattern = 3.
> +     This should return NULL because we cross the input vectors.
> +     Because,
> +     arg0_len = 16 + 16x
> +     a1 = 0
> +     S = 2
> +     esel = arg0_len / npatterns_sel = 16+16x/1 = 16 + 16x
> +     ae = 0 + (esel - 2) * S
> +	= 0 + (16 + 16x - 2) * 2
> +	= 28 + 32x
> +     a1 / arg0_len = 0 /trunc (16 + 16x) = 0
> +     ae / arg0_len = (28 + 32x) /trunc (16 + 16x), which is not defined,
> +     since 28/16 != 32/16.
> +     So return NULL_TREE.  */

The division should succeed now, so as the test says, the reason should
instead be that ae is in the second input.

> +  {
> +    tree arg0 = build_vec_cst_rand (char_type_node, 1, 3, 2);
> +    tree arg1 = build_vec_cst_rand (char_type_node, 1, 3, 2);
> +    poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +
> +    vec_perm_builder builder (arg0_len, 1, 3);
> +    builder.quick_push (arg0_len);
> +    builder.quick_push (0);
> +    builder.quick_push (2);
> +
> +    vec_perm_indices sel (builder, 2, arg0_len);
> +    const char *reason;
> +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel, &reason, false);
> +    gcc_assert (res == NULL_TREE);
> +    gcc_assert (!strcmp (reason, "crossed input vectors"));

The tests should use ASSERT_* macros rather than gcc_assert.

> +  }
> +
> +  /* Case 5: Select elements from different patterns.
> +     Should return NULL.  */
> +  {
> +    tree op0 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> +    tree op1 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> +    poly_uint64 op0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0));
> +
> +    vec_perm_builder builder (op0_len, 2, 3);
> +    builder.quick_push (op0_len);
> +    int mask_elems[] = { 0, 0, 0, 1, 0 };
> +    for (int i = 0; i < 5; i++)
> +      builder.quick_push (mask_elems[i]);

Should be 6 elements here too.

> +
> +    vec_perm_indices sel (builder, 2, op0_len);
> +    const char *reason;
> +    tree res = fold_vec_perm_cst (TREE_TYPE (op0), op0, op1, sel, &reason, false);
> +    gcc_assert (res == NULL_TREE);
> +    gcc_assert (!strcmp (reason, "S is not multiple of npatterns"));
> +  }
> +
> +  /* Case 6: Select pattern 0 of op0 and dup of op0[0]
> +     op0, op1, sel: npatterns = 2, nelts_per_pattern = 3
> +     sel = { 0, 0, 2, 0, 4, 0, ... }.
> +
> +     For pattern {0, 2, 4, ...}:
> +     a1 = 2
> +     len = 16 + 16x
> +     S = 2
> +     esel = len / npatterns_sel = (16 + 16x) / 2 = (8 + 8x)
> +     ae = a1 + (esel - 2) * S
> +	= 2 + (8 + 8x - 2) * 2
> +	= 14 + 16x
> +     a1 / arg0_len = 2 / (16 + 16x) = 0
> +     ae / arg0_len = (14 + 16x) / (16 + 16x) = 0
> +     So a1/arg0_len = ae/arg0_len = 0
> +     Hence we select from first vector op0
> +     S = 2, npatterns = 2.
> +     Since S is multiple of npatterns(op0), we are selecting from
> +     same pattern of op0.
> +
> +     For pattern {0, ...}, we are choosing { op0[0] ... }
> +     So res will be combination of above patterns:
> +     res: { op0[0], op0[0], op0[2], op0[0], op0[4], op0[0], ... }
> +     with npatterns = 2, nelts_per_pattern = 3.  */
> +  {
> +    tree op0 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> +    tree op1 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> +    poly_uint64 op0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0));
> +
> +    vec_perm_builder builder (op0_len, 2, 3);
> +    int mask_elems[] = { 0, 0, 2, 0, 4, 0 };
> +    for (int i = 0; i < 6; i++)
> +      builder.quick_push (mask_elems[i]);
> +
> +    vec_perm_indices sel (builder, 2, op0_len);
> +    tree res = fold_vec_perm_cst (TREE_TYPE (op0), op0, op1, sel);
> +    tree expected_res[] = { vector_cst_elt (op0, 0), vector_cst_elt (op0, 0),
> +			    vector_cst_elt (op0, 2), vector_cst_elt (op0, 0),
> +			    vector_cst_elt (op0, 4), vector_cst_elt (op0, 0) };
> +    validate_res (2, 3, res, expected_res);
> +  }
> +
> +  /* Case 7: sel_npatterns > op_npatterns;
> +     op0, op1: npatterns = 2, nelts_per_pattern = 3
> +     sel: { 0, 0, 1, len, 2, 0, 3, len, 4, 0, 5, len, ...},
> +     with npatterns = 4, nelts_per_pattern = 3.  */
> +  {
> +    tree op0 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> +    tree op1 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> +    poly_uint64 op0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (op0));
> +
> +    vec_perm_builder builder(op0_len, 4, 3);
> +    // -1 is used as place holder for poly_int_cst
> +    int mask_elems[] = { 0, 0, 1, -1, 2, 0, 3, -1, 4, 0, 5, -1 };
> +    for (int i = 0; i < 12; i++)
> +      builder.quick_push ((mask_elems[i] == -1) ? op0_len : mask_elems[i]);
> +
> +    vec_perm_indices sel (builder, 2, op0_len);
> +    tree res = fold_vec_perm_cst (TREE_TYPE (op0), op0, op1, sel);
> +    tree expected_res[] = { vector_cst_elt (op0, 0), vector_cst_elt (op0, 0),
> +			    vector_cst_elt (op0, 1), vector_cst_elt (op1, 0),
> +			    vector_cst_elt (op0, 2), vector_cst_elt (op0, 0),
> +			    vector_cst_elt (op0, 3), vector_cst_elt (op1, 0),
> +			    vector_cst_elt (op0, 4), vector_cst_elt (op0, 0),
> +			    vector_cst_elt (op0, 5), vector_cst_elt (op1, 0) };
> +    validate_res (4, 3, res, expected_res);
> +  }
> +}
> +
> +static void
> +test_dup ()
> +{
> +  /* Case 1: mask = {0, ...} */
> +  {
> +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> +    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +
> +    vec_perm_builder builder (len, 1, 1);
> +    builder.quick_push (0);
> +    vec_perm_indices sel (builder, 2, len);
> +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> +
> +    tree expected_res[] = { vector_cst_elt (res, 0) };
> +    validate_res (1, 1, res, expected_res);
> +  }
> +
> +  /* Case 2: mask = {len, ...} */
> +  {
> +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> +    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +
> +    vec_perm_builder builder (len, 1, 1);
> +    builder.quick_push (len);
> +    vec_perm_indices sel (builder, 2, len);
> +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> +
> +    tree expected_res[] = { vector_cst_elt (arg1, 0) };
> +    validate_res (1, 1, res, expected_res);
> +  }
> +
> +  /* Case 3: mask = { 0, len, ... } */
> +  {
> +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> +    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +
> +    vec_perm_builder builder (len, 2, 1);
> +    builder.quick_push (0);
> +    builder.quick_push (len);
> +    vec_perm_indices sel (builder, 2, len);
> +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> +
> +    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg1, 0) };
> +    validate_res (2, 1, res, expected_res);
> +  }
> +
> +  /* Case 4: mask = { 0, len, 1, len+1, ... } */
> +  {
> +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> +    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +
> +    vec_perm_builder builder (len, 2, 2);
> +    builder.quick_push (0);
> +    builder.quick_push (len);
> +    builder.quick_push (1);
> +    builder.quick_push (len + 1);
> +    vec_perm_indices sel (builder, 2, len);
> +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> +
> +    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg1, 0),
> +			    vector_cst_elt (arg0, 1), vector_cst_elt (arg1, 1)
> +			  };
> +    validate_res (2, 2, res, expected_res);
> +  }
> +
> +  /* Case 5: mask = { 0, len, 1, len+1, .... }
> +     npatterns = 4, nelts_per_pattern = 1 */
> +  {
> +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> +    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +
> +    vec_perm_builder builder (len, 4, 1);
> +    builder.quick_push (0);
> +    builder.quick_push (len);
> +    builder.quick_push (1);
> +    builder.quick_push (len + 1);
> +    vec_perm_indices sel (builder, 2, len);
> +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> +
> +    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg1, 0),
> +			    vector_cst_elt (arg0, 1), vector_cst_elt (arg1, 1)
> +			  };
> +    validate_res (4, 1, res, expected_res);
> +  }
> +
> +  /* Case 6: mask = {0, 4, ...}
> +     npatterns = 1, nelts_per_pattern = 2.
> +     This should return NULL_TREE because the index 4 may choose
> +     from either arg0 or arg1 depending on vector length.  */
> +  {
> +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> +    poly_uint64 len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +
> +    vec_perm_builder builder (len, 1, 2);
> +    builder.quick_push (0);
> +    builder.quick_push (4);
> +    vec_perm_indices sel (builder, 2, len);
> +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> +    ASSERT_TRUE (res == NULL_TREE);
> +  }
> +
> +  /* Case 7: npatterns(arg0) = 4 > npatterns(sel) = 2
> +     mask = {0, len, 1, len + 1, ...}
> +     sel_npatterns = 2, sel_nelts_per_pattern = 2.  */

This is a good test to have, but it doesn't seem to match the
name of the containing function (test_dup).

> +  {
> +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 1);
> +    poly_uint64 arg0_len = TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg0));
> +
> +    vec_perm_builder builder (arg0_len, 2, 2);
> +    builder.quick_push (0);
> +    builder.quick_push (arg0_len);
> +    builder.quick_push (1);
> +    builder.quick_push (arg0_len + 1);
> +    vec_perm_indices sel (builder, 2, arg0_len);
> +    tree res = fold_vec_perm_cst (TREE_TYPE (arg0), arg0, arg1, sel);
> +
> +    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg1, 0),
> +			    vector_cst_elt (arg0, 1), vector_cst_elt (arg1, 1)
> +			  };
> +    validate_res (2, 2, res, expected_res);
> +  }
> +}
> +
> +static void
> +test_mixed ()
> +{
> +  /* Case 1: op0, op1 -> VLS, sel -> VLA and selects from both input vectors.
> +     In this case, we treat res_npatterns = nelts in input vector
> +     and res_nelts_per_pattern = 1, and create a dup pattern.
> +     sel = { 0, 4, 1, 5, ... }
> +     res = { op0[0], op1[0], op0[1], op1[1], ...} // (4, 1)
> +     res_npatterns = 4, res_nelts_per_pattern = 1.  */
> +  {
> +    tree arg_vectype = build_vector_type (integer_type_node, 4);
> +    tree arg0 = build_vec_cst_rand (integer_type_node, 4, 1, 0, arg_vectype);
> +    tree arg1 = build_vec_cst_rand (integer_type_node, 4, 1, 0, arg_vectype);
> +
> +    tree res_type = get_preferred_vectype (integer_type_node);
> +    poly_uint64 res_len = TYPE_VECTOR_SUBPARTS (res_type);
> +    vec_perm_builder builder (res_len, 4, 1);
> +    builder.quick_push (0);
> +    builder.quick_push (4);
> +    builder.quick_push (1);
> +    builder.quick_push (5);
> +
> +    vec_perm_indices sel (builder, 2, res_len);
> +    tree res = fold_vec_perm_cst (res_type, arg0, arg1, sel);
> +    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg1, 0),
> +			    vector_cst_elt (arg0, 1), vector_cst_elt (arg1, 1)
> +			  };
> +    validate_res (4, 1, res, expected_res);
> +  }
> +
> +  /* Case 2: Same as Case 1, but sel contains an out of bounds index.
> +     result should be NULL_TREE.  */
> +  {
> +    tree arg_vectype = build_vector_type (integer_type_node, 4);
> +    tree arg0 = build_vec_cst_rand (integer_type_node, 4, 1, 0, arg_vectype);
> +    tree arg1 = build_vec_cst_rand (integer_type_node, 4, 1, 0, arg_vectype);
> +
> +    tree res_type = get_preferred_vectype (integer_type_node);
> +    poly_uint64 res_len = TYPE_VECTOR_SUBPARTS (res_type);
> +    vec_perm_builder builder (res_len, 4, 1);
> +    builder.quick_push (0);
> +    builder.quick_push (8);
> +    builder.quick_push (1);
> +    builder.quick_push (5);
> +
> +    vec_perm_indices sel (builder, 2, res_len);
> +    const char *reason; 
> +    tree res = fold_vec_perm_cst (res_type, arg0, arg1, sel, &reason);
> +    gcc_assert (res == NULL_TREE);
> +    gcc_assert (!strcmp (reason, "out of bounds access"));
> +  }
> +
> +  /* Case 3: op0, op1 are VLA and sel is VLS.
> +     op0, op1: VNx16QI with shape (2, 3)
> +     sel = V4SI with values {0, 2, 4, 6}
> +     res: V4SI with values { op0[0], op0[2], op0[4], op0[6] }.  */
> +  {
> +    tree arg0 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> +    tree arg1 = build_vec_cst_rand (char_type_node, 2, 3, 2);
> +
> +    poly_uint64 res_len = 4;
> +    tree res_type = build_vector_type (char_type_node, res_len);
> +    vec_perm_builder builder (res_len, 4, 1);
> +    builder.quick_push (0);
> +    builder.quick_push (2);
> +    builder.quick_push (4);
> +    builder.quick_push (6);
> +
> +    vec_perm_indices sel (builder, 2, res_len);
> +    tree res = fold_vec_perm_cst (res_type, arg0, arg1, sel);
> +
> +    tree expected_res[] = { vector_cst_elt (arg0, 0), vector_cst_elt (arg0, 2),
> +			    vector_cst_elt (arg0, 4), vector_cst_elt (arg0, 6)
> +			  };
> +    validate_res_vls (res, expected_res, 4);
> +  }
> +
> +  /* Case 4: Same as case 4, but op0, op1 are VNx4SI with shape (2, 3) and step = 2

Same as case 3?

Thanks,
Richard

> +     sel = V4SI with values {0, 2, 4, 6}
> +     In this case result should be NULL_TREE because we cross input vector
> +     boundary at index 4.  */
> +  {
> +    tree arg0 = build_vec_cst_rand (integer_type_node, 2, 3, 2);
> +    tree arg1 = build_vec_cst_rand (integer_type_node, 2, 3, 2);
> +
> +    poly_uint64 res_len = 4;
> +    tree res_type = build_vector_type (char_type_node, res_len);
> +    vec_perm_builder builder (res_len, 4, 1);
> +    builder.quick_push (0);
> +    builder.quick_push (2);
> +    builder.quick_push (4);
> +    builder.quick_push (6);
> +
> +    vec_perm_indices sel (builder, 2, res_len);
> +    const char *reason;
> +    tree res = fold_vec_perm_cst (res_type, arg0, arg1, sel, &reason);
> +    gcc_assert (res == NULL_TREE);
> +    gcc_assert (!strcmp (reason, "cannot divide selector element by arg len"));
> +  }
> +}
> +
> +static void
> +test ()
> +{
> +  tree vectype = get_preferred_vectype (integer_type_node);
> +  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
> +    return;
> +
> +  test_dup ();
> +  test_stepped ();
> +  test_mixed ();
> +}
> +};
> +
>  /* Verify that various binary operations on vectors are folded
>     correctly.  */
>  
> @@ -16943,6 +17626,7 @@ fold_const_cc_tests ()
>    test_arithmetic_folding ();
>    test_vector_folding ();
>    test_vec_duplicate_folding ();
> +  test_fold_vec_perm_cst::test ();
>  }
>  
>  } // namespace selftest