From: Richard Sandiford <richard.sandiford@arm.com>
To: Prathamesh Kulkarni
Cc: gcc Patches <gcc-patches@gcc.gnu.org>
Subject: Re: [1/2] PR96463 - aarch64 specific changes
Date: Mon, 06 Jun 2022 11:59:11 +0100
In-Reply-To: (Prathamesh Kulkarni's message of "Sun, 5 Jun 2022 15:45:13 +0530")

Prathamesh Kulkarni writes:
>> >  {
>> >    /* The pattern matching functions above are written to look for a small
>> >       number to begin the sequence (0, 1, N/2).  If we begin with an index
>> > @@ -24084,6 +24112,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>> >        || d->vec_flags == VEC_SVE_PRED)
>> >       && known_gt (nelt, 1))
>> >      {
>> > +      /* If operand and result modes differ, then only check
>> > +        for dup case.  */
>> > +      if (d->vmode != op_mode)
>> > +       return (d->vec_flags == VEC_SVE_DATA)
>> > +               ? aarch64_evpc_sve_dup (d, op_mode) : false;
>> > +
>>
>> I think it'd be more future-proof to format this as:
>>
>>     if (d->vmode == d->op_mode)
>>       {
>>         …existing code…
>>       }
>>     else
>>       {
>>         if (aarch64_evpc_sve_dup (d))
>>           return true;
>>       }
>>
>> with the d->vec_flags == VEC_SVE_DATA check being in aarch64_evpc_sve_dup,
>> alongside the op_mode check.  I think we'll be adding more checks here
>> over time.
> Um I was wondering if we should structure it as:
> if (d->vmode == d->op_mode)
>   {
>      ...existing code...
>   }
> if (aarch64_evpc_sve_dup (d))
>   return true;
>
> So we check for dup irrespective of d->vmode == d->op_mode ?

Yeah, I can see the attraction of that.  I think the else is better
though because the fallback TBL handling will (rightly) come at the end
of the existing code.  Without the else, we'd have specific tests like
DUP after generic ones like TBL, so the reader would have to work out
for themselves that DUP and TBL handle disjoint cases.
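To make that concrete, the overall shape I have in mind is roughly the
following (just a sketch to show the control flow, with most of the
existing same-mode candidates elided):

    if (d->vmode == d->op_mode)
      {
        if (aarch64_evpc_rev_local (d))
          return true;
        /* ...the other existing same-mode candidates...  */
        else if (aarch64_evpc_reencode (d))
          return true;

        if (d->vec_flags == VEC_SVE_DATA)
          return aarch64_evpc_sve_tbl (d);
        else if (d->vec_flags == VEC_ADVSIMD)
          return aarch64_evpc_tbl (d);
      }
    else
      {
        if (aarch64_evpc_sve_dup (d))
          return true;
      }
    return false;

That way the generic TBL fallback stays last within the same-mode path,
and the cross-mode DUP handling is visibly separate from it.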
>> >        if (aarch64_evpc_rev_local (d))
>> >          return true;
>> >        else if (aarch64_evpc_rev_global (d))
>> > @@ -24105,7 +24139,12 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>> >        else if (aarch64_evpc_reencode (d))
>> >          return true;
>> >        if (d->vec_flags == VEC_SVE_DATA)
>> > -       return aarch64_evpc_sve_tbl (d);
>> > +       {
>> > +         if (aarch64_evpc_sve_tbl (d))
>> > +           return true;
>> > +         else if (aarch64_evpc_sve_dup (d, op_mode))
>> > +           return true;
>> > +       }
>> >        else if (d->vec_flags == VEC_ADVSIMD)
>> >          return aarch64_evpc_tbl (d);
>> >      }
>>
>> Is this part still needed, given the above?
>>
>> Thanks,
>> Richard
>>
>> > @@ -24119,9 +24158,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>> >                                     rtx target, rtx op0, rtx op1,
>> >                                     const vec_perm_indices &sel)
>> >  {
>> > -  if (vmode != op_mode)
>> > -    return false;
>> > -
>> >    struct expand_vec_perm_d d;
>> >
>> >    /* Check whether the mask can be applied to a single vector.  */
>> > @@ -24154,10 +24190,10 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>> >    d.testing_p = !target;
>> >
>> >    if (!d.testing_p)
>> > -    return aarch64_expand_vec_perm_const_1 (&d);
>> > +    return aarch64_expand_vec_perm_const_1 (&d, op_mode);
>> >
>> >    rtx_insn *last = get_last_insn ();
>> > -  bool ret = aarch64_expand_vec_perm_const_1 (&d);
>> > +  bool ret = aarch64_expand_vec_perm_const_1 (&d, op_mode);
>> >    gcc_assert (last == get_last_insn ());
>> >
>> >    return ret;
>
> diff --git a/gcc/config/aarch64/aarch64-sve-builtins-base.cc b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> index bee410929bd..1a804b1ab73 100644
> --- a/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> +++ b/gcc/config/aarch64/aarch64-sve-builtins-base.cc
> @@ -44,6 +44,7 @@
>  #include "aarch64-sve-builtins-shapes.h"
>  #include "aarch64-sve-builtins-base.h"
>  #include "aarch64-sve-builtins-functions.h"
> +#include "ssa.h"
>
>  using namespace aarch64_sve;
>
> @@ -1207,6 +1208,64 @@ public:
>      insn_code icode = code_for_aarch64_sve_ld1rq (e.vector_mode (0));
>      return e.use_contiguous_load_insn (icode);
>    }
> +
> +  gimple *
> +  fold (gimple_folder &f) const override
> +  {
> +    tree arg0 = gimple_call_arg (f.call, 0);
> +    tree arg1 = gimple_call_arg (f.call, 1);
> +
> +    /* Transform:
> +       lhs = svld1rq ({-1, -1, ... }, arg1)
> +       into:
> +       tmp = mem_ref [(int * {ref-all}) arg1]
> +       lhs = vec_perm_expr.
> +       on little endian target.
> +       vectype is the corresponding ADVSIMD type.  */
> +
> +    if (!BYTES_BIG_ENDIAN
> +       && integer_all_onesp (arg0))
> +      {
> +       tree lhs = gimple_call_lhs (f.call);
> +       tree lhs_type = TREE_TYPE (lhs);
> +       poly_uint64 lhs_len = TYPE_VECTOR_SUBPARTS (lhs_type);
> +       tree eltype = TREE_TYPE (lhs_type);
> +
> +       scalar_mode elmode = GET_MODE_INNER (TYPE_MODE (lhs_type));
> +       machine_mode vq_mode = aarch64_vq_mode (elmode).require ();
> +       tree vectype = build_vector_type_for_mode (eltype, vq_mode);
> +
> +       tree elt_ptr_type
> +         = build_pointer_type_for_mode (eltype, VOIDmode, true);
> +       tree zero = build_zero_cst (elt_ptr_type);
> +
> +       /* Use element type alignment.  */
> +       tree access_type
> +         = build_aligned_type (vectype, TYPE_ALIGN (eltype));
> +
> +       tree mem_ref_lhs = make_ssa_name_fn (cfun, access_type, 0);
> +       tree mem_ref_op = fold_build2 (MEM_REF, access_type, arg1, zero);
> +       gimple *mem_ref_stmt
> +         = gimple_build_assign (mem_ref_lhs, mem_ref_op);
> +       gsi_insert_before (f.gsi, mem_ref_stmt, GSI_SAME_STMT);
> +
> +       int source_nelts = TYPE_VECTOR_SUBPARTS (access_type).to_constant ();
> +       vec_perm_builder sel (lhs_len, source_nelts, 1);
> +       for (int i = 0; i < source_nelts; i++)
> +         sel.quick_push (i);
> +
> +       vec_perm_indices indices (sel, 1, source_nelts);
> +       gcc_checking_assert (can_vec_perm_const_p (TYPE_MODE (lhs_type),
> +                                                  TYPE_MODE (access_type),
> +                                                  indices));
> +       tree mask_type = build_vector_type (ssizetype, lhs_len);
> +       tree mask = vec_perm_indices_to_tree (mask_type, indices);
> +       return gimple_build_assign (lhs, VEC_PERM_EXPR,
> +                                   mem_ref_lhs, mem_ref_lhs, mask);
> +      }
> +
> +    return NULL;
> +  }
>  };
>
>  class svld1ro_impl : public load_replicate
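As an illustration of the transformation this fold is aiming for (a
made-up example of mine, not something taken from the patch or its
tests): on a little-endian target, given

    #include <arm_sve.h>

    svint32_t
    foo (int32_t *x)
    {
      return svld1rq_s32 (svptrue_b32 (), x);
    }

the call should be folded to gimple along the lines of

    tmp_1 = MEM <int32x4_t> [(int32_t *)x];
    lhs_2 = VEC_PERM_EXPR <tmp_1, tmp_1, { 0, 1, 2, 3, ... }>;

i.e. a 128-bit AdvSIMD load whose four elements are then repeated
across the whole (variable-length) SVE result by the permute mask.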
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index d4c575ce976..bb24701b0d2 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -23395,8 +23395,10 @@ struct expand_vec_perm_d
>  {
>    rtx target, op0, op1;
>    vec_perm_indices perm;
> +  machine_mode op_mode;
>    machine_mode vmode;
>    unsigned int vec_flags;
> +  unsigned int op_vec_flags;

Very minor, but it would be good to keep the order consistent:
output mode first or input mode first.  Guess it might as well be
output mode first, to match the hook:

  machine_mode vmode;
  machine_mode op_mode;
  unsigned int vec_flags;
  unsigned int op_vec_flags;

>    bool one_vector_p;
>    bool testing_p;
>  };
> @@ -23945,6 +23947,32 @@ aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
>    return true;
>  }
>
> +/* Try to implement D using SVE dup instruction.  */
> +
> +static bool
> +aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
> +{
> +  if (BYTES_BIG_ENDIAN
> +      || !d->one_vector_p
> +      || d->vec_flags != VEC_SVE_DATA
> +      || d->op_vec_flags != VEC_ADVSIMD

Sorry, one more: DUPQ only handles 128-bit AdvSIMD modes, so we also need:

      || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128)

This isn't redundant with any of the other tests.  (We can use DUP .D
for 64-bit input vectors, but that's a separate patch.)

OK with those changes (including using "else" :-)), thanks.

Richard

> +      || d->perm.encoding ().nelts_per_pattern () != 1
> +      || !known_eq (d->perm.encoding ().npatterns (),
> +                   GET_MODE_NUNITS (d->op_mode)))
> +    return false;
> +
> +  int npatterns = d->perm.encoding ().npatterns ();
> +  for (int i = 0; i < npatterns; i++)
> +    if (!known_eq (d->perm[i], i))
> +      return false;
> +
> +  if (d->testing_p)
> +    return true;
> +
> +  aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
> +  return true;
> +}
> +
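For reference, with that extra test folded in, the function would read
something like this (a sketch only, built from the patch above plus the
suggested condition; exact placement of the new test is up to you):

    static bool
    aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
    {
      /* Only an SVE data result built from a single 128-bit AdvSIMD
         operand, with a permutation that simply repeats the operand's
         elements, can be implemented as DUPQ.  */
      if (BYTES_BIG_ENDIAN
          || !d->one_vector_p
          || d->vec_flags != VEC_SVE_DATA
          || d->op_vec_flags != VEC_ADVSIMD
          || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128)
          || d->perm.encoding ().nelts_per_pattern () != 1
          || !known_eq (d->perm.encoding ().npatterns (),
                        GET_MODE_NUNITS (d->op_mode)))
        return false;

      /* Check that the permutation selects {0, 1, ..., npatterns-1}.  */
      int npatterns = d->perm.encoding ().npatterns ();
      for (int i = 0; i < npatterns; i++)
        if (!known_eq (d->perm[i], i))
          return false;

      if (d->testing_p)
        return true;

      aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
      return true;
    }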
>  /* Try to implement D using SVE SEL instruction.  */
>
>  static bool
> @@ -24084,30 +24112,39 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
>        || d->vec_flags == VEC_SVE_PRED)
>       && known_gt (nelt, 1))
>      {
> -      if (aarch64_evpc_rev_local (d))
> -       return true;
> -      else if (aarch64_evpc_rev_global (d))
> -       return true;
> -      else if (aarch64_evpc_ext (d))
> -       return true;
> -      else if (aarch64_evpc_dup (d))
> -       return true;
> -      else if (aarch64_evpc_zip (d))
> -       return true;
> -      else if (aarch64_evpc_uzp (d))
> -       return true;
> -      else if (aarch64_evpc_trn (d))
> -       return true;
> -      else if (aarch64_evpc_sel (d))
> -       return true;
> -      else if (aarch64_evpc_ins (d))
> -       return true;
> -      else if (aarch64_evpc_reencode (d))
> +      /* If operand and result modes differ, then only check
> +        for dup case.  */
> +      if (d->vmode == d->op_mode)
> +       {
> +         if (aarch64_evpc_rev_local (d))
> +           return true;
> +         else if (aarch64_evpc_rev_global (d))
> +           return true;
> +         else if (aarch64_evpc_ext (d))
> +           return true;
> +         else if (aarch64_evpc_dup (d))
> +           return true;
> +         else if (aarch64_evpc_zip (d))
> +           return true;
> +         else if (aarch64_evpc_uzp (d))
> +           return true;
> +         else if (aarch64_evpc_trn (d))
> +           return true;
> +         else if (aarch64_evpc_sel (d))
> +           return true;
> +         else if (aarch64_evpc_ins (d))
> +           return true;
> +         else if (aarch64_evpc_reencode (d))
> +           return true;
> +
> +         if (d->vec_flags == VEC_SVE_DATA)
> +           return aarch64_evpc_sve_tbl (d);
> +         else if (d->vec_flags == VEC_ADVSIMD)
> +           return aarch64_evpc_tbl (d);
> +       }
> +
> +      if (aarch64_evpc_sve_dup (d))
>          return true;
> -      if (d->vec_flags == VEC_SVE_DATA)
> -       return aarch64_evpc_sve_tbl (d);
> -      else if (d->vec_flags == VEC_ADVSIMD)
> -       return aarch64_evpc_tbl (d);
>      }
>    return false;
>  }
> @@ -24119,9 +24156,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>                                   rtx target, rtx op0, rtx op1,
>                                   const vec_perm_indices &sel)
>  {
> -  if (vmode != op_mode)
> -    return false;
> -
>    struct expand_vec_perm_d d;
>
>    /* Check whether the mask can be applied to a single vector.  */
> @@ -24145,6 +24179,8 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
>                                sel.nelts_per_input ());
>    d.vmode = vmode;
>    d.vec_flags = aarch64_classify_vector_mode (d.vmode);
> +  d.op_mode = op_mode;
> +  d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
>    d.target = target;
>    d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
>    if (op0 == op1)