public inbox for gcc-patches@gcc.gnu.org
* [PATCH] aarch64: Don't include vec_select high-half in SIMD multiply cost
@ 2021-07-28 13:36 Jonathan Wright
  2021-08-04  9:05 ` Richard Sandiford
  0 siblings, 1 reply; 4+ messages in thread
From: Jonathan Wright @ 2021-07-28 13:36 UTC (permalink / raw)
  To: gcc-patches; +Cc: Richard Sandiford, Kyrylo Tkachov

[-- Attachment #1: Type: text/plain, Size: 1200 bytes --]

Hi,

The Neon multiply, multiply-accumulate and multiply-subtract
instructions can select the top or bottom half of their operand
registers. This selection does not change the cost of the underlying
instruction, and the RTL cost function should reflect that.

This patch adds RTL tree traversal in the Neon multiply cost function
to match a vec_select of the high half of an operand. The traversal
prevents the cost of the vec_select from being added to the cost of
the multiply, meaning that these instructions can now be generated in
the combine pass as they are no longer deemed prohibitively
expensive.
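
As an illustration, consider a widening multiply of two high halves
(a hand-written example in the spirit of the new test; the expected
instruction is an assumption based on the Armv8-A ISA, since the
exact output depends on tuning and register allocation):

  #include <arm_neon.h>

  /* With the cost fix, combine is expected to form a single widening
     multiply of the high halves (e.g. smull2 v0.4s, v0.8h, v1.8h on
     little-endian) instead of first extracting the halves and using
     a plain smull.  */
  int32x4_t
  widen_mul_high (int16x8_t a, int16x8_t b)
  {
    return vmull_s16 (vget_high_s16 (a), vget_high_s16 (b));
  }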

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-19  Jonathan Wright  <jonathan.wright@arm.com>

	* config/aarch64/aarch64.c (aarch64_vec_select_high_operand_p):
	Define.
	(aarch64_rtx_mult_cost): Traverse RTL tree to prevent cost of
	vec_select high-half from being added into Neon multiply
	cost.
	* rtlanal.c (vec_series_highpart_p): Define.
	* rtlanal.h (vec_series_highpart_p): Declare.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vmul_high_cost.c: New test.

[-- Attachment #2: rb14704.patch --]
[-- Type: application/octet-stream, Size: 6947 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 5809887997305317c5a81421089db431685e2927..a49672afe785e3517250d324468edacceab5c9d3 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -76,6 +76,7 @@
 #include "function-abi.h"
 #include "gimple-pretty-print.h"
 #include "tree-ssa-loop-niter.h"
+#include "rtlanal.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -11970,6 +11971,19 @@ aarch64_cheap_mult_shift_p (rtx x)
   return false;
 }
 
+/* Return true iff X is an operand of a select-high-half vector
+   instruction.  */
+
+static bool
+aarch64_vec_select_high_operand_p (rtx x)
+{
+  return ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
+	  && GET_CODE (XEXP (x, 0)) == VEC_SELECT
+	  && vec_series_highpart_p (GET_MODE (XEXP (x, 0)),
+				    GET_MODE (XEXP (XEXP (x, 0), 0)),
+				    XEXP (XEXP (x, 0), 1)));
+}
+
 /* Helper function for rtx cost calculation.  Calculate the cost of
    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
    Return the calculated cost of the expression, recursing manually in to
@@ -11995,6 +12009,13 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
       if (vec_flags & VEC_ADVSIMD)
 	{
+	  /* The select-operand-high-half versions of the instruction have the
+	     same cost as the three vector version - don't add the costs of the
+	     select into the costs of the multiply.  */
+	  if (aarch64_vec_select_high_operand_p (op0))
+	    op0 = XEXP (XEXP (op0, 0), 0);
+	  if (aarch64_vec_select_high_operand_p (op1))
+	    op1 = XEXP (XEXP (op1, 0), 0);
 	  /* The by-element versions of the instruction have the same costs as
 	     the normal 3-vector version.  So don't add the costs of the
 	     duplicate or subsequent select into the costs of the multiply.  We
diff --git a/gcc/rtlanal.h b/gcc/rtlanal.h
index e1642424db89736675ac3e0d505aeaa59dca8bad..542dc7898bead27d3da89e5138c49563ba226eae 100644
--- a/gcc/rtlanal.h
+++ b/gcc/rtlanal.h
@@ -331,6 +331,10 @@ inline vec_rtx_properties_base::~vec_rtx_properties_base ()
    collecting the references a second time.  */
 using vec_rtx_properties = growing_rtx_properties<vec_rtx_properties_base>;
 
+bool
+vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode,
+		       rtx sel);
+
 bool
 vec_series_lowpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel);
 
diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c
index ec7a062829cb4ead3eaedf1546956107f4ad3bb2..3db49e7a8237bef8ffd9aa4036bb2cfdb1cee6d5 100644
--- a/gcc/rtlanal.c
+++ b/gcc/rtlanal.c
@@ -6941,6 +6941,25 @@ register_asm_p (const_rtx x)
 	  && DECL_REGISTER (REG_EXPR (x)));
 }
 
+/* Return true if, for all OP of mode OP_MODE:
+
+     (vec_select:RESULT_MODE OP SEL)
+
+   is equivalent to the highpart RESULT_MODE of OP.  */
+
+bool
+vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel)
+{
+  int nunits;
+  if (GET_MODE_NUNITS (op_mode).is_constant (&nunits)
+      && targetm.can_change_mode_class (op_mode, result_mode, ALL_REGS))
+    {
+      int offset = BYTES_BIG_ENDIAN ? 0 : nunits - XVECLEN (sel, 0);
+      return rtvec_series_p (XVEC (sel, 0), offset);
+    }
+  return false;
+}
+
 /* Return true if, for all OP of mode OP_MODE:
 
      (vec_select:RESULT_MODE OP SEL)
diff --git a/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c
new file mode 100644
index 0000000000000000000000000000000000000000..ecc02e652a4ba40e2fd68154ca8be5d322f43468
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+#define TEST_MULL_VEC(name, rettype, intype, ts, rs) \
+  rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \
+	{ \
+		rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), \
+					      vget_high_ ## ts (c)); \
+		rettype t1 = name ## _ ## ts (vget_high_ ## ts (b), \
+					      vget_high_ ## ts (c)); \
+		return vqaddq ## _ ## rs (t0, t1); \
+	}
+
+TEST_MULL_VEC (vmull, int16x8_t, int8x16_t, s8, s16)
+TEST_MULL_VEC (vmull, uint16x8_t, uint8x16_t, u8, u16)
+TEST_MULL_VEC (vmull, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_VEC (vmull, uint32x4_t, uint16x8_t, u16, u32)
+TEST_MULL_VEC (vmull, int64x2_t, int32x4_t, s32, s64)
+TEST_MULL_VEC (vmull, uint64x2_t, uint32x4_t, u32, u64)
+
+TEST_MULL_VEC (vqdmull, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_VEC (vqdmull, int64x2_t, int32x4_t, s32, s64)
+
+#define TEST_MULL_N(name, rettype, intype, ts, rs) \
+  rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \
+	{ \
+		rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), b[1]); \
+		rettype t1 = name ## _ ## ts (vget_high_ ## ts (a), c[1]); \
+		return vqaddq ## _ ## rs (t0, t1); \
+	}
+
+TEST_MULL_N (vmull_n, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_N (vmull_n, uint32x4_t, uint16x8_t, u16, u32)
+TEST_MULL_N (vmull_n, int64x2_t, int32x4_t, s32, s64)
+TEST_MULL_N (vmull_n, uint64x2_t, uint32x4_t, u32, u64)
+
+TEST_MULL_N (vqdmull_n, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_N (vqdmull_n, int64x2_t, int32x4_t, s32, s64)
+
+#define TEST_MLXL_VEC(name, rettype, intype, ts) \
+  rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b, \
+				    intype c) \
+	{ \
+		acc = name ## _ ## ts (acc, vget_high_ ## ts (a), \
+					    vget_high_ ## ts (b)); \
+		return name ## _ ## ts (acc, vget_high_ ## ts (a), \
+					     vget_high_ ## ts (c)); \
+	}
+
+TEST_MLXL_VEC (vmlal, int16x8_t, int8x16_t, s8)
+TEST_MLXL_VEC (vmlal, uint16x8_t, uint8x16_t, u8)
+TEST_MLXL_VEC (vmlal, int32x4_t, int16x8_t, s16)
+TEST_MLXL_VEC (vmlal, uint32x4_t, uint16x8_t, u16)
+
+TEST_MLXL_VEC (vmlsl, int16x8_t, int8x16_t, s8)
+TEST_MLXL_VEC (vmlsl, uint16x8_t, uint8x16_t, u8)
+TEST_MLXL_VEC (vmlsl, int32x4_t, int16x8_t, s16)
+TEST_MLXL_VEC (vmlsl, uint32x4_t, uint16x8_t, u16)
+
+#define TEST_MLXL_N(name, rettype, intype, ts) \
+  rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b) \
+	{ \
+		acc = name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \
+		return name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \
+	}
+
+TEST_MLXL_N (vmlal_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vmlal_n, uint32x4_t, uint16x8_t, u16)
+TEST_MLXL_N (vmlal_n, int64x2_t, int32x4_t, s32)
+TEST_MLXL_N (vmlal_n, uint64x2_t, uint32x4_t, u32)
+
+TEST_MLXL_N (vmlsl_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vmlsl_n, uint32x4_t, uint16x8_t, u16)
+TEST_MLXL_N (vmlsl_n, int64x2_t, int32x4_t, s32)
+TEST_MLXL_N (vmlsl_n, uint64x2_t, uint32x4_t, u32)
+
+TEST_MLXL_N (vqdmlal_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vqdmlal_n, int64x2_t, int32x4_t, s32)
+
+TEST_MLXL_N (vqdmlsl_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vqdmlsl_n, int64x2_t, int32x4_t, s32)
+
+/* { dg-final { scan-assembler-not "dup\\t" } } */


* Re: [PATCH] aarch64: Don't include vec_select high-half in SIMD multiply cost
  2021-07-28 13:36 [PATCH] aarch64: Don't include vec_select high-half in SIMD multiply cost Jonathan Wright
@ 2021-08-04  9:05 ` Richard Sandiford
  2021-08-04 15:27   ` [PATCH V2] " Jonathan Wright
  0 siblings, 1 reply; 4+ messages in thread
From: Richard Sandiford @ 2021-08-04  9:05 UTC (permalink / raw)
  To: Jonathan Wright via Gcc-patches

Jonathan Wright via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> Hi,
>
> The Neon multiply, multiply-accumulate and multiply-subtract
> instructions can select the top or bottom half of their operand
> registers. This selection does not change the cost of the underlying
> instruction, and the RTL cost function should reflect that.
>
> This patch adds RTL tree traversal in the Neon multiply cost function
> to match a vec_select of the high half of an operand. The traversal
> prevents the cost of the vec_select from being added to the cost of
> the multiply, meaning that these instructions can now be generated in
> the combine pass as they are no longer deemed prohibitively
> expensive.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.

Like you say, the instructions can handle both the low and high halves.
Shouldn't we also check for the low part (as a SIGN/ZERO_EXTEND of
a subreg)?
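
For concreteness: on little-endian the low half occupies the first
lanes of the register, so after simplification it usually appears not
as a vec_select but as an extend of a lowpart subreg, e.g.
(sign_extend:V4SI (subreg:V4HI (reg:V8HI) 0)). A hypothetical sketch
of such a check (not part of the posted patch; the name and exact
condition are illustrative, though the accessors are the standard
rtl ones):

  /* Hypothetical: match (sign_extend (subreg ...)) or
     (zero_extend (subreg ...)) where the subreg selects the
     low part of the inner vector register.  */
  static bool
  vec_select_low_operand_p (rtx x)
  {
    if (GET_CODE (x) != ZERO_EXTEND && GET_CODE (x) != SIGN_EXTEND)
      return false;
    rtx inner = XEXP (x, 0);
    return SUBREG_P (inner) && subreg_lowpart_p (inner);
  }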

> [...]
> +/* Return true iff X is an operand of a select-high-half vector
> +   instruction.  */
> +
> +static bool
> +aarch64_vec_select_high_operand_p (rtx x)
> +{
> +  return ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
> +	  && GET_CODE (XEXP (x, 0)) == VEC_SELECT
> +	  && vec_series_highpart_p (GET_MODE (XEXP (x, 0)),
> +				    GET_MODE (XEXP (XEXP (x, 0), 0)),
> +				    XEXP (XEXP (x, 0), 1)));
> +}
> +
>  /* Helper function for rtx cost calculation.  Calculate the cost of
>     a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
>     Return the calculated cost of the expression, recursing manually in to
> @@ -11995,6 +12009,13 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
>        unsigned int vec_flags = aarch64_classify_vector_mode (mode);
>        if (vec_flags & VEC_ADVSIMD)
>  	{
> +	  /* The select-operand-high-half versions of the instruction have the
> +	     same cost as the three vector version - don't add the costs of the
> +	     select into the costs of the multiply.  */
> +	  if (aarch64_vec_select_high_operand_p (op0))
> +	    op0 = XEXP (XEXP (op0, 0), 0);
> +	  if (aarch64_vec_select_high_operand_p (op1))
> +	    op1 = XEXP (XEXP (op1, 0), 0);

For consistency with aarch64_strip_duplicate_vec_elt, I think this
should be something like aarch64_strip_vec_extension, returning
the inner rtx on success and the original one on failure.

Thanks,
Richard


* [PATCH V2] aarch64: Don't include vec_select high-half in SIMD multiply cost
  2021-08-04  9:05 ` Richard Sandiford
@ 2021-08-04 15:27   ` Jonathan Wright
  2021-08-04 15:41     ` Richard Sandiford
  0 siblings, 1 reply; 4+ messages in thread
From: Jonathan Wright @ 2021-08-04 15:27 UTC (permalink / raw)
  To: Richard Sandiford, Jonathan Wright via Gcc-patches

[-- Attachment #1: Type: text/plain, Size: 10632 bytes --]

Hi,

Changes suggested here and those discussed off-list have been
implemented in V2 of the patch.

Regression tested and bootstrapped on aarch64-none-linux-gnu - no
issues.

Ok for master?

Thanks,
Jonathan

---

gcc/ChangeLog:

2021-07-19  Jonathan Wright  <jonathan.wright@arm.com>

	* config/aarch64/aarch64.c (aarch64_strip_extend_vec_half):
	Define.
	(aarch64_rtx_mult_cost): Traverse RTL tree to prevent cost of
	vec_select high-half from being added into Neon multiply
	cost.
	* rtlanal.c (vec_series_highpart_p): Define.
	* rtlanal.h (vec_series_highpart_p): Declare.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vmul_high_cost.c: New test.
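
For reference, each TEST_MULL_VEC instance in the new test expands to
a plain function like the one below (hand-expanded for the s8 case;
with the cost fix, combine should use the high-half form of the
widening multiply directly, which is what the test's
scan-assembler-not check on "dup" verifies):

  #include <arm_neon.h>

  int16x8_t
  test_vmull_s8 (int8x16_t a, int8x16_t b, int8x16_t c)
  {
    /* Both multiplies read the high halves of 128-bit inputs; no
       separate instructions should be needed to extract the halves.  */
    int16x8_t t0 = vmull_s8 (vget_high_s8 (a), vget_high_s8 (c));
    int16x8_t t1 = vmull_s8 (vget_high_s8 (b), vget_high_s8 (c));
    return vqaddq_s16 (t0, t1);
  }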

From: Richard Sandiford <richard.sandiford@arm.com>
Sent: 04 August 2021 10:05
To: Jonathan Wright via Gcc-patches <gcc-patches@gcc.gnu.org>
Cc: Jonathan Wright <Jonathan.Wright@arm.com>
Subject: Re: [PATCH] aarch64: Don't include vec_select high-half in SIMD multiply cost 
 
Jonathan Wright via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> [...]

Like you say, the instructions can handle both the low and high halves.
Shouldn't we also check for the low part (as a SIGN/ZERO_EXTEND of
a subreg)?

> [...]
> +       /* The select-operand-high-half versions of the instruction have the
> +          same cost as the three vector version - don't add the costs of the
> +          select into the costs of the multiply.  */
> +       if (aarch64_vec_select_high_operand_p (op0))
> +         op0 = XEXP (XEXP (op0, 0), 0);
> +       if (aarch64_vec_select_high_operand_p (op1))
> +         op1 = XEXP (XEXP (op1, 0), 0);

For consistency with aarch64_strip_duplicate_vec_elt, I think this
should be something like aarch64_strip_vec_extension, returning
the inner rtx on success and the original one on failure.

Thanks,
Richard


[-- Attachment #2: rb14704.patch --]
[-- Type: application/octet-stream, Size: 6977 bytes --]

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 0eae925be07722e2d80c06b1eeafb5611bacf331..10a436ad7e6fa6c5de706ee5abbdc6fb3d268076 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -78,6 +78,7 @@
 #include "gimple-pretty-print.h"
 #include "tree-ssa-loop-niter.h"
 #include "fractional-cost.h"
+#include "rtlanal.h"
 
 /* This file should be included last.  */
 #include "target-def.h"
@@ -12046,6 +12047,22 @@ aarch64_strip_extend (rtx x, bool strip_shift)
   return x;
 }
 
+/* Helper function for rtx cost calculation. Strip extension as well as any
+   inner VEC_SELECT high-half from X. Returns the inner vector operand if
+   successful, or the original expression on failure.  */
+static rtx
+aarch64_strip_extend_vec_half (rtx x)
+{
+  if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
+    {
+      x = XEXP (x, 0);
+      if (GET_CODE (x) == VEC_SELECT
+	  && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
+				    XEXP (x, 1)))
+	x = XEXP (x, 0);
+    }
+  return x;
+}
 
 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
    any subsequent extend and VEC_SELECT from X. Returns the inner scalar
@@ -12133,6 +12150,11 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
       if (vec_flags & VEC_ADVSIMD)
 	{
+	  /* The select-operand-high-half versions of the instruction have the
+	     same cost as the three vector version - don't add the costs of the
+	     extension or selection into the costs of the multiply.  */
+	  op0 = aarch64_strip_extend_vec_half (op0);
+	  op1 = aarch64_strip_extend_vec_half (op1);
 	  /* The by-element versions of the instruction have the same costs as
 	     the normal 3-vector version.  We make an assumption that the input
 	     to the VEC_DUPLICATE is already on the FP & SIMD side.  This means
diff --git a/gcc/rtlanal.h b/gcc/rtlanal.h
index e1642424db89736675ac3e0d505aeaa59dca8bad..542dc7898bead27d3da89e5138c49563ba226eae 100644
--- a/gcc/rtlanal.h
+++ b/gcc/rtlanal.h
@@ -331,6 +331,10 @@ inline vec_rtx_properties_base::~vec_rtx_properties_base ()
    collecting the references a second time.  */
 using vec_rtx_properties = growing_rtx_properties<vec_rtx_properties_base>;
 
+bool
+vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode,
+		       rtx sel);
+
 bool
 vec_series_lowpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel);
 
diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c
index f7f3acb75db86fa68829a1d3a472348430542f2e..d37f7789b2051fd59abddfda0c8a0fdccaa9f1dc 100644
--- a/gcc/rtlanal.c
+++ b/gcc/rtlanal.c
@@ -6953,6 +6953,25 @@ register_asm_p (const_rtx x)
 	  && DECL_REGISTER (REG_EXPR (x)));
 }
 
+/* Return true if, for all OP of mode OP_MODE:
+
+     (vec_select:RESULT_MODE OP SEL)
+
+   is equivalent to the highpart RESULT_MODE of OP.  */
+
+bool
+vec_series_highpart_p (machine_mode result_mode, machine_mode op_mode, rtx sel)
+{
+  int nunits;
+  if (GET_MODE_NUNITS (op_mode).is_constant (&nunits)
+      && targetm.can_change_mode_class (op_mode, result_mode, ALL_REGS))
+    {
+      int offset = BYTES_BIG_ENDIAN ? 0 : nunits - XVECLEN (sel, 0);
+      return rtvec_series_p (XVEC (sel, 0), offset);
+    }
+  return false;
+}
+
 /* Return true if, for all OP of mode OP_MODE:
 
      (vec_select:RESULT_MODE OP SEL)
diff --git a/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c
new file mode 100644
index 0000000000000000000000000000000000000000..ecc02e652a4ba40e2fd68154ca8be5d322f43468
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vmul_high_cost.c
@@ -0,0 +1,85 @@
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <arm_neon.h>
+
+#define TEST_MULL_VEC(name, rettype, intype, ts, rs) \
+  rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \
+	{ \
+		rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), \
+					      vget_high_ ## ts (c)); \
+		rettype t1 = name ## _ ## ts (vget_high_ ## ts (b), \
+					      vget_high_ ## ts (c)); \
+		return vqaddq ## _ ## rs (t0, t1); \
+	}
+
+TEST_MULL_VEC (vmull, int16x8_t, int8x16_t, s8, s16)
+TEST_MULL_VEC (vmull, uint16x8_t, uint8x16_t, u8, u16)
+TEST_MULL_VEC (vmull, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_VEC (vmull, uint32x4_t, uint16x8_t, u16, u32)
+TEST_MULL_VEC (vmull, int64x2_t, int32x4_t, s32, s64)
+TEST_MULL_VEC (vmull, uint64x2_t, uint32x4_t, u32, u64)
+
+TEST_MULL_VEC (vqdmull, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_VEC (vqdmull, int64x2_t, int32x4_t, s32, s64)
+
+#define TEST_MULL_N(name, rettype, intype, ts, rs) \
+  rettype test_ ## name ## _ ## ts (intype a, intype b, intype c) \
+	{ \
+		rettype t0 = name ## _ ## ts (vget_high_ ## ts (a), b[1]); \
+		rettype t1 = name ## _ ## ts (vget_high_ ## ts (a), c[1]); \
+		return vqaddq ## _ ## rs (t0, t1); \
+	}
+
+TEST_MULL_N (vmull_n, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_N (vmull_n, uint32x4_t, uint16x8_t, u16, u32)
+TEST_MULL_N (vmull_n, int64x2_t, int32x4_t, s32, s64)
+TEST_MULL_N (vmull_n, uint64x2_t, uint32x4_t, u32, u64)
+
+TEST_MULL_N (vqdmull_n, int32x4_t, int16x8_t, s16, s32)
+TEST_MULL_N (vqdmull_n, int64x2_t, int32x4_t, s32, s64)
+
+#define TEST_MLXL_VEC(name, rettype, intype, ts) \
+  rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b, \
+				    intype c) \
+	{ \
+		acc = name ## _ ## ts (acc, vget_high_ ## ts (a), \
+					    vget_high_ ## ts (b)); \
+		return name ## _ ## ts (acc, vget_high_ ## ts (a), \
+					     vget_high_ ## ts (c)); \
+	}
+
+TEST_MLXL_VEC (vmlal, int16x8_t, int8x16_t, s8)
+TEST_MLXL_VEC (vmlal, uint16x8_t, uint8x16_t, u8)
+TEST_MLXL_VEC (vmlal, int32x4_t, int16x8_t, s16)
+TEST_MLXL_VEC (vmlal, uint32x4_t, uint16x8_t, u16)
+
+TEST_MLXL_VEC (vmlsl, int16x8_t, int8x16_t, s8)
+TEST_MLXL_VEC (vmlsl, uint16x8_t, uint8x16_t, u8)
+TEST_MLXL_VEC (vmlsl, int32x4_t, int16x8_t, s16)
+TEST_MLXL_VEC (vmlsl, uint32x4_t, uint16x8_t, u16)
+
+#define TEST_MLXL_N(name, rettype, intype, ts) \
+  rettype test_ ## name ## _ ## ts (rettype acc, intype a, intype b) \
+	{ \
+		acc = name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \
+		return name ## _ ## ts (acc, vget_high_ ## ts (a), b[1]); \
+	}
+
+TEST_MLXL_N (vmlal_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vmlal_n, uint32x4_t, uint16x8_t, u16)
+TEST_MLXL_N (vmlal_n, int64x2_t, int32x4_t, s32)
+TEST_MLXL_N (vmlal_n, uint64x2_t, uint32x4_t, u32)
+
+TEST_MLXL_N (vmlsl_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vmlsl_n, uint32x4_t, uint16x8_t, u16)
+TEST_MLXL_N (vmlsl_n, int64x2_t, int32x4_t, s32)
+TEST_MLXL_N (vmlsl_n, uint64x2_t, uint32x4_t, u32)
+
+TEST_MLXL_N (vqdmlal_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vqdmlal_n, int64x2_t, int32x4_t, s32)
+
+TEST_MLXL_N (vqdmlsl_n, int32x4_t, int16x8_t, s16)
+TEST_MLXL_N (vqdmlsl_n, int64x2_t, int32x4_t, s32)
+
+/* { dg-final { scan-assembler-not "dup\\t" } } */


* Re: [PATCH V2] aarch64: Don't include vec_select high-half in SIMD multiply cost
  2021-08-04 15:27   ` [PATCH V2] " Jonathan Wright
@ 2021-08-04 15:41     ` Richard Sandiford
  0 siblings, 0 replies; 4+ messages in thread
From: Richard Sandiford @ 2021-08-04 15:41 UTC (permalink / raw)
  To: Jonathan Wright; +Cc: Jonathan Wright via Gcc-patches

Jonathan Wright <Jonathan.Wright@arm.com> writes:
> Hi,
>
> Changes suggested here and those discussed off-list have been
> implemented in V2 of the patch.
>
> Regression tested and bootstrapped on aarch64-none-linux-gnu - no
> issues.
>
> Ok for master?
>
> Thanks,
> Jonathan
>
> ---
>
> gcc/ChangeLog:
>
> 2021-07-19  Jonathan Wright  <jonathan.wright@arm.com>
>
>         * config/aarch64/aarch64.c (aarch64_strip_extend_vec_half):
>         Define.
>         (aarch64_rtx_mult_cost): Traverse RTL tree to prevent cost of
>         vec_select high-half from being added into Neon multiply
>         cost.
>         * rtlanal.c (vec_series_highpart_p): Define.
>         * rtlanal.h (vec_series_highpart_p): Declare.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/vmul_high_cost.c: New test.

OK, thanks.

Richard


Thread overview: 4+ messages
2021-07-28 13:36 [PATCH] aarch64: Don't include vec_select high-half in SIMD multiply cost Jonathan Wright
2021-08-04  9:05 ` Richard Sandiford
2021-08-04 15:27   ` [PATCH V2] " Jonathan Wright
2021-08-04 15:41     ` Richard Sandiford