From: Richard Sandiford
To: Tamar Christina
Cc: Tamar Christina via Gcc-patches, Richard Earnshaw, nd, Marcus Shawcroft
Subject: Re: [PATCH 2/2] AArch64: Add better costing for vector constants and operations
Date: Tue, 26 Oct 2021 15:46:05 +0100

Tamar Christina writes:
> Hi,
>
> Following the discussion below here's a revised patch.
>
> Bootstrapped Regtested on aarch64-none-linux-gnu and no issues.
>
> Ok for master?

Looks good functionally, just got some comments about the implementation.

> @@ -14006,8 +14007,52 @@ cost_plus:
>                               mode, MULT, 1, speed);
>            return true;
>          }
> +      break;
> +    case CONST_VECTOR:
> +      {
> +        rtx gen_insn = aarch64_simd_make_constant (x, true);
> +        /* Not a valid const vector.  */
> +        if (!gen_insn)
> +          break;
>
> -      /* Fall through.  */
> +        switch (GET_CODE (gen_insn))
> +        {
> +        case CONST_VECTOR:
> +          /* Load using MOVI/MVNI.  */
> +          if (aarch64_simd_valid_immediate (x, NULL))
> +            *cost += extra_cost->vect.movi;
> +          else /* Load using constant pool.  */
> +            *cost += extra_cost->ldst.load;
> +          break;
> +        /* Load using a DUP.  */
> +        case VEC_DUPLICATE:
> +          gcc_unreachable ();
> +          break;
> +        default:
> +          *cost += extra_cost->ldst.load;
> +          break;
> +        }
> +        return true;
> +      }

This might be a problem (if it is a problem) with some of the existing
cases too, but: is using += rather than = the right behaviour here?
It means that we add our cost on top of whatever the target-independent
rtx_costs thought was a good default choice, whereas it looks like these
table entries specify the correct full cost.  If it's not clear-cut,
then I think using = would be better.
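As a toy illustration of the difference (not patch code; the variable
names and the particular cost values here are made up):

    int cost = COSTS_N_INSNS (1);  /* default seeded by generic rtx_costs */
    int movi = COSTS_N_INSNS (1);  /* table entry: full cost of the insn  */

    cost += movi;  /* 2 insns' worth: stacks the entry on the default  */
    cost = movi;   /* 1 insn's worth: the entry stands as the full cost */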
Also, going back to an earlier part of the thread, I think the “inner”
CONST_VECTOR case is now a correct replacement for the “outer”
CONST_VECTOR case, meaning we don't need the
aarch64_simd_make_constant bits.  I.e. I think we can make the
top-level case:

    case CONST_VECTOR:
      /* Load using MOVI/MVNI.  */
      if (aarch64_simd_valid_immediate (x, NULL))
        *cost = extra_cost->vect.movi;
      else /* Load using constant pool.  */
        *cost = extra_cost->ldst.load;
      break;

> +    case VEC_CONCAT:
> +      /* depending on the operation, either DUP or INS.
> +         For now, keep default costing.  */
> +      break;
> +    case VEC_DUPLICATE:
> +      *cost += extra_cost->vect.dup;
> +      return true;

For this I think we should do:

    *cost = extra_cost->vect.dup;
    return false;

so that we cost the operand of the vec_duplicate as well.  This will
have no effect if the operand is a REG, but would affect more complex
expressions.

> +    case VEC_SELECT:
> +      {

Here I think we should recurse on operand 0:

    rtx op0 = XEXP (x, 0);
    *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);

> +        /* cost subreg of 0 as free, otherwise as DUP */
> +        rtx op1 = XEXP (x, 1);
> +        if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
> +          ;
> +        else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
> +          *cost += extra_cost->vect.dup;
> +        else
> +          *cost += extra_cost->vect.extract;
> +        return true;
> +      }
>      default:
>        break;
>      }
> @@ -20654,9 +20699,12 @@ aarch64_builtin_support_vector_misalignment (machine_mode mode,
>
>  /* If VALS is a vector constant that can be loaded into a register
>     using DUP, generate instructions to do so and return an RTX to
> -   assign to the register.  Otherwise return NULL_RTX.  */
> +   assign to the register.  Otherwise return NULL_RTX.
> +
> +   If CHECK then the resulting instruction may not be used in
> +   codegen but can be used for costing.  */
>  static rtx
> -aarch64_simd_dup_constant (rtx vals)
> +aarch64_simd_dup_constant (rtx vals, bool check = false)
>  {
>    machine_mode mode = GET_MODE (vals);
>    machine_mode inner_mode = GET_MODE_INNER (mode);
> @@ -20668,7 +20716,8 @@ aarch64_simd_dup_constant (rtx vals)
>    /* We can load this constant by using DUP and a constant in a
>       single ARM register.  This will be cheaper than a vector
>       load.  */
> -  x = copy_to_mode_reg (inner_mode, x);
> +  if (!check)
> +    x = copy_to_mode_reg (inner_mode, x);
>    return gen_vec_duplicate (mode, x);
>  }
>
> @@ -20676,9 +20725,12 @@ aarch64_simd_dup_constant (rtx vals)
>  /* Generate code to load VALS, which is a PARALLEL containing only
>     constants (for vec_init) or CONST_VECTOR, efficiently into a
>     register.  Returns an RTX to copy into the register, or NULL_RTX
> -   for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
> +   for a PARALLEL that cannot be converted into a CONST_VECTOR.
> +
> +   If CHECK then the resulting instruction may not be used in
> +   codegen but can be used for costing.  */
>  static rtx
> -aarch64_simd_make_constant (rtx vals)
> +aarch64_simd_make_constant (rtx vals, bool check = false)
>  {
>    machine_mode mode = GET_MODE (vals);
>    rtx const_dup;
> @@ -20710,7 +20762,7 @@ aarch64_simd_make_constant (rtx vals)
>        && aarch64_simd_valid_immediate (const_vec, NULL))
>      /* Load using MOVI/MVNI.  */
>      return const_vec;
> -  else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
> +  else if ((const_dup = aarch64_simd_dup_constant (vals, check)) != NULL_RTX)
>      /* Loaded using DUP.  */
>      return const_dup;
>    else if (const_vec != NULL_RTX)

With the inner CONST_VECTOR case replacing the outer one, I think we
can drop the aarch64_simd_dup_constant and aarch64_simd_make_constant
bits.
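For reference, folding the two suggestions above into the patch's cases
would give something roughly like this (a sketch of the reviewed
direction, not final code; it assumes the surrounding aarch64_rtx_costs
context for x, mode, speed and extra_cost, and vec_series_lowpart_p /
vec_series_highpart_p as posted in this series):

    case VEC_DUPLICATE:
      *cost = extra_cost->vect.dup;
      /* Return false so the duplicated operand is costed too; a plain
         REG adds nothing, but more complex operands will.  */
      return false;

    case VEC_SELECT:
      {
        /* Cost the vector being selected from.  */
        rtx op0 = XEXP (x, 0);
        *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);

        /* Cost a select of the low part as free, the high part as a
           DUP, and anything else as an element extract.  */
        rtx op1 = XEXP (x, 1);
        if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
          ;
        else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
          *cost += extra_cost->vect.dup;
        else
          *cost += extra_cost->vect.extract;
        return true;
      }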
> diff --git a/gcc/config/arm/aarch-common-protos.h b/gcc/config/arm/aarch-common-protos.h
> index 6be5fb1e083d7ff130386dfa181b9a0c8fd5437c..55a470d8e1410bdbcfbea084ec11b468485c1400 100644
> --- a/gcc/config/arm/aarch-common-protos.h
> +++ b/gcc/config/arm/aarch-common-protos.h
> @@ -133,6 +133,9 @@ struct vector_cost_table
>  {
>    const int alu;
>    const int mult;
> +  const int movi;
> +  const int dup;
> +  const int extract;
>  };
>
>  struct cpu_cost_table
> diff --git a/gcc/config/arm/aarch-cost-tables.h b/gcc/config/arm/aarch-cost-tables.h
> index 25ff702f01fab50d749b9a7b7b072c2be2504562..0e6a62665c7e18debc382a294a37945188fb90ef 100644
> --- a/gcc/config/arm/aarch-cost-tables.h
> +++ b/gcc/config/arm/aarch-cost-tables.h
> @@ -122,7 +122,10 @@ const struct cpu_cost_table generic_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -226,7 +229,10 @@ const struct cpu_cost_table cortexa53_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -330,7 +336,10 @@ const struct cpu_cost_table cortexa57_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -434,7 +443,10 @@ const struct cpu_cost_table cortexa76_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (1),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -538,7 +550,10 @@ const struct cpu_cost_table exynosm1_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (0),  /* alu.  */
> -    COSTS_N_INSNS (4)   /* mult.  */
> +    COSTS_N_INSNS (4),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> @@ -642,7 +657,10 @@ const struct cpu_cost_table xgene1_extra_costs =
>    /* Vector */
>    {
>      COSTS_N_INSNS (2),  /* alu.  */
> -    COSTS_N_INSNS (8)   /* mult.  */
> +    COSTS_N_INSNS (8),  /* mult.  */
> +    COSTS_N_INSNS (1),  /* movi.  */
> +    COSTS_N_INSNS (2),  /* dup.  */
> +    COSTS_N_INSNS (2)   /* extract.  */
>    }
>  };
>
> diff --git a/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> new file mode 100644
> index 0000000000000000000000000000000000000000..f9edcda13d27bb3463da5b0170cfda7f41655b3c
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/vect-cse-codegen.c
> @@ -0,0 +1,97 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -march=armv8.2-a+crypto -fno-schedule-insns -fno-schedule-insns2 -mcmodel=small" } */
> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */

Could you try this with -mabi=ilp32?  It looks like it might fail.
Skipping it is OK if so.
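If it does fail there, an untested sketch of the usual way to skip it,
following the pattern used elsewhere in the aarch64 testsuite, would be:

    /* { dg-skip-if "LP64-only code generation" { *-*-* } { "-mabi=ilp32" } { "" } } */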
OK with those changes, if they work.

Thanks,
Richard

> +
> +#include <arm_neon.h>
> +
> +/*
> +**test1:
> +**    adrp    x[0-9]+, .LC[0-9]+
> +**    ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**    add     v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**    str     q[0-9]+, \[x[0-9]+\]
> +**    fmov    x[0-9]+, d[0-9]+
> +**    orr     x[0-9]+, x[0-9]+, x[0-9]+
> +**    ret
> +*/
> +
> +uint64_t
> +test1 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
> +{
> +  uint64_t arr[2] = { 0x0942430810234076UL, 0x0942430810234076UL};
> +  uint64_t res = a | arr[0];
> +  uint64x2_t val = vld1q_u64 (arr);
> +  *rt = vaddq_u64 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test2:
> +**    adrp    x[0-9]+, .LC[0-1]+
> +**    ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**    add     v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
> +**    str     q[0-9]+, \[x[0-9]+\]
> +**    fmov    x[0-9]+, d[0-9]+
> +**    orr     x[0-9]+, x[0-9]+, x[0-9]+
> +**    ret
> +*/
> +
> +uint64_t
> +test2 (uint64_t a, uint64x2_t b, uint64x2_t* rt)
> +{
> +  uint64x2_t val = vdupq_n_u64 (0x0424303242234076UL);
> +  uint64_t arr = vgetq_lane_u64 (val, 0);
> +  uint64_t res = a | arr;
> +  *rt = vaddq_u64 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test3:
> +**    adrp    x[0-9]+, .LC[0-9]+
> +**    ldr     q[0-9]+, \[x[0-9]+, #:lo12:.LC[0-9]+\]
> +**    add     v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
> +**    str     q[0-9]+, \[x1\]
> +**    fmov    w[0-9]+, s[0-9]+
> +**    orr     w[0-9]+, w[0-9]+, w[0-9]+
> +**    ret
> +*/
> +
> +uint32_t
> +test3 (uint32_t a, uint32x4_t b, uint32x4_t* rt)
> +{
> +  uint32_t arr[4] = { 0x094243, 0x094243, 0x094243, 0x094243 };
> +  uint32_t res = a | arr[0];
> +  uint32x4_t val = vld1q_u32 (arr);
> +  *rt = vaddq_u32 (val, b);
> +  return res;
> +}
> +
> +/*
> +**test4:
> +**    ushr    v[0-9]+.16b, v[0-9]+.16b, 7
> +**    mov     x[0-9]+, 16512
> +**    movk    x[0-9]+, 0x1020, lsl 16
> +**    movk    x[0-9]+, 0x408, lsl 32
> +**    movk    x[0-9]+, 0x102, lsl 48
> +**    fmov    d[0-9]+, x[0-9]+
> +**    pmull   v[0-9]+.1q, v[0-9]+.1d, v[0-9]+.1d
> +**    dup     v[0-9]+.2d, v[0-9]+.d\[0\]
> +**    pmull2  v[0-9]+.1q, v[0-9]+.2d, v[0-9]+.2d
> +**    trn2    v[0-9]+.8b, v[0-9]+.8b, v[0-9]+.8b
> +**    umov    w[0-9]+, v[0-9]+.h\[3\]
> +**    ret
> +*/
> +
> +uint64_t
> +test4 (uint8x16_t input)
> +{
> +  uint8x16_t bool_input = vshrq_n_u8(input, 7);
> +  poly64x2_t mask = vdupq_n_p64(0x0102040810204080UL);
> +  poly64_t prodL = vmull_p64((poly64_t)vgetq_lane_p64((poly64x2_t)bool_input, 0),
> +                             vgetq_lane_p64(mask, 0));
> +  poly64_t prodH = vmull_high_p64((poly64x2_t)bool_input, mask);
> +  uint8x8_t res = vtrn2_u8((uint8x8_t)prodL, (uint8x8_t)prodH);
> +  return vget_lane_u16((uint16x4_t)res, 3);
> +}