public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] by_pieces: Properly set m_max_size in op_by_pieces
@ 2021-08-03 13:56 H.J. Lu
  2021-08-03 21:22 ` H.J. Lu
  2021-08-04 13:33 ` [PATCH v2] x86: Update STORE_MAX_PIECES H.J. Lu
  0 siblings, 2 replies; 7+ messages in thread
From: H.J. Lu @ 2021-08-03 13:56 UTC (permalink / raw)
  To: gcc-patches; +Cc: Uros Bizjak, Richard Sandiford

1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit
moves are enabled, since x86 implements store_by_pieces with vec_duplicate,
which is available only when inter-unit moves are enabled.
2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to
STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for
compare_by_pieces.

gcc/

	PR target/101742
	* expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to
	STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES
	for compare_by_pieces.
	* config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode
	only if TARGET_INTER_UNIT_MOVES_TO_VEC is true.

gcc/testsuite/

	PR target/101742
	* gcc.target/i386/pr101742a.c: New test.
	* gcc.target/i386/pr101742b.c: Likewise.
---
 gcc/config/i386/i386.h                    | 20 +++++++++++---------
 gcc/expr.c                                |  6 +++++-
 gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++
 gcc/testsuite/gcc.target/i386/pr101742b.c |  4 ++++
 4 files changed, 36 insertions(+), 10 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index bed9cd9da18..9b416abd5f4 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1783,15 +1783,17 @@ typedef struct ix86_args {
 /* STORE_MAX_PIECES is the number of bytes at a time that we can
    store efficiently.  */
 #define STORE_MAX_PIECES \
-  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
-   ? 64 \
-   : ((TARGET_AVX \
-       && !TARGET_PREFER_AVX128 \
-       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
-      ? 32 \
-      : ((TARGET_SSE2 \
-	  && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
-	 ? 16 : UNITS_PER_WORD)))
+  (TARGET_INTER_UNIT_MOVES_TO_VEC \
+   ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+      ? 64 \
+      : ((TARGET_AVX \
+	  && !TARGET_PREFER_AVX128 \
+	  && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+	  ? 32 \
+	  : ((TARGET_SSE2 \
+	      && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+	      ? 16 : UNITS_PER_WORD))) \
+   : UNITS_PER_WORD)
 
 /* If a memory-to-memory move would take MOVE_RATIO or more simple
    move-instruction pairs, we will do a cpymem or libcall instead.
diff --git a/gcc/expr.c b/gcc/expr.c
index b65cfcfdcd1..2964b38b9a5 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -1131,7 +1131,11 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
 				bool qi_vector_mode)
   : m_to (to, to_load, NULL, NULL),
     m_from (from, from_load, from_cfn, from_cfn_data),
-    m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
+    m_len (len),
+    m_max_size (((!to_load && from == nullptr)
+		 ? STORE_MAX_PIECES
+		 : (from_cfn != nullptr
+		    ? COMPARE_MAX_PIECES : MOVE_MAX_PIECES)) + 1),
     m_push (push), m_qi_vector_mode (qi_vector_mode)
 {
   int toi = m_to.get_addr_inc ();
diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c b/gcc/testsuite/gcc.target/i386/pr101742a.c
new file mode 100644
index 00000000000..67ea40587dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101742a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mtune=nano-x2" } */
+
+int n2;
+
+__attribute__ ((simd)) char
+w7 (void)
+{
+  short int xb = n2;
+  int qp;
+
+  for (qp = 0; qp < 2; ++qp)
+    xb = xb < 1;
+
+  return xb;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c b/gcc/testsuite/gcc.target/i386/pr101742b.c
new file mode 100644
index 00000000000..ba19064077b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101742b.c
@@ -0,0 +1,4 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mtune=nano-x2 -mtune-ctrl=sse_unaligned_store_optimal" } */
+
+#include "pr101742a.c"
-- 
2.31.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] by_pieces: Properly set m_max_size in op_by_pieces
  2021-08-03 13:56 [PATCH] by_pieces: Properly set m_max_size in op_by_pieces H.J. Lu
@ 2021-08-03 21:22 ` H.J. Lu
  2021-08-04  7:27   ` Richard Sandiford
  2021-08-04 13:33 ` [PATCH v2] x86: Update STORE_MAX_PIECES H.J. Lu
  1 sibling, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2021-08-03 21:22 UTC (permalink / raw)
  To: GCC Patches; +Cc: Uros Bizjak, Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 4285 bytes --]

On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit
> move is enabled since x86 uses vec_duplicate, which is enabled only when
> inter-unit move is enabled, to implement store_by_pieces.
> 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to
> STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for
> compare_by_pieces.
>
> gcc/
>
>         PR target/101742
>         * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to
>         STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES
>         for compare_by_pieces.
>         * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode
>         only if TARGET_INTER_UNIT_MOVES_TO_VEC is true.
>
> gcc/testsuite/
>
>         PR target/101742
>         * gcc.target/i386/pr101742a.c: New test.
>         * gcc.target/i386/pr101742b.c: Likewise.
> ---
>  gcc/config/i386/i386.h                    | 20 +++++++++++---------
>  gcc/expr.c                                |  6 +++++-
>  gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr101742b.c |  4 ++++
>  4 files changed, 36 insertions(+), 10 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c
>
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index bed9cd9da18..9b416abd5f4 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -1783,15 +1783,17 @@ typedef struct ix86_args {
>  /* STORE_MAX_PIECES is the number of bytes at a time that we can
>     store efficiently.  */
>  #define STORE_MAX_PIECES \
> -  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> -   ? 64 \
> -   : ((TARGET_AVX \
> -       && !TARGET_PREFER_AVX128 \
> -       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> -      ? 32 \
> -      : ((TARGET_SSE2 \
> -         && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> -        ? 16 : UNITS_PER_WORD)))
> +  (TARGET_INTER_UNIT_MOVES_TO_VEC \
> +   ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> +      ? 64 \
> +      : ((TARGET_AVX \
> +         && !TARGET_PREFER_AVX128 \
> +         && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> +         ? 32 \
> +         : ((TARGET_SSE2 \
> +             && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> +             ? 16 : UNITS_PER_WORD))) \
> +   : UNITS_PER_WORD)
>
>  /* If a memory-to-memory move would take MOVE_RATIO or more simple
>     move-instruction pairs, we will do a cpymem or libcall instead.
> diff --git a/gcc/expr.c b/gcc/expr.c
> index b65cfcfdcd1..2964b38b9a5 100644
> --- a/gcc/expr.c
> +++ b/gcc/expr.c
> @@ -1131,7 +1131,11 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
>                                 bool qi_vector_mode)
>    : m_to (to, to_load, NULL, NULL),
>      m_from (from, from_load, from_cfn, from_cfn_data),
> -    m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
> +    m_len (len),
> +    m_max_size (((!to_load && from == nullptr)
> +                ? STORE_MAX_PIECES
> +                : (from_cfn != nullptr
> +                   ? COMPARE_MAX_PIECES : MOVE_MAX_PIECES)) + 1),
>      m_push (push), m_qi_vector_mode (qi_vector_mode)
>  {
>    int toi = m_to.get_addr_inc ();

This larger expr.c patch passes the proper MAX_PIECES directly.

> diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c b/gcc/testsuite/gcc.target/i386/pr101742a.c
> new file mode 100644
> index 00000000000..67ea40587dd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101742a.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -mtune=nano-x2" } */
> +
> +int n2;
> +
> +__attribute__ ((simd)) char
> +w7 (void)
> +{
> +  short int xb = n2;
> +  int qp;
> +
> +  for (qp = 0; qp < 2; ++qp)
> +    xb = xb < 1;
> +
> +  return xb;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c b/gcc/testsuite/gcc.target/i386/pr101742b.c
> new file mode 100644
> index 00000000000..ba19064077b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101742b.c
> @@ -0,0 +1,4 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -mtune=nano-x2 -mtune-ctrl=sse_unaligned_store_optimal" } */
> +
> +#include "pr101742a.c"
> --
> 2.31.1
>


-- 
H.J.

[-- Attachment #2: p.patch --]
[-- Type: text/x-patch, Size: 2736 bytes --]

diff --git a/gcc/expr.c b/gcc/expr.c
index b65cfcfdcd1..66ac1986f02 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -1110,8 +1110,8 @@ class op_by_pieces_d
   }
 
  public:
-  op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
-		  unsigned HOST_WIDE_INT, unsigned int, bool,
+  op_by_pieces_d (unsigned int, rtx, bool, rtx, bool, by_pieces_constfn,
+		  void *, unsigned HOST_WIDE_INT, unsigned int, bool,
 		  bool = false);
   void run ();
 };
@@ -1122,8 +1122,8 @@ class op_by_pieces_d
    and its associated FROM_CFN_DATA can be used to replace loads with
    constant values.  LEN describes the length of the operation.  */
 
-op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
-				rtx from, bool from_load,
+op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to,
+				bool to_load, rtx from, bool from_load,
 				by_pieces_constfn from_cfn,
 				void *from_cfn_data,
 				unsigned HOST_WIDE_INT len,
@@ -1131,7 +1131,7 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
 				bool qi_vector_mode)
   : m_to (to, to_load, NULL, NULL),
     m_from (from, from_load, from_cfn, from_cfn_data),
-    m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
+    m_len (len), m_max_size (max_pieces + 1),
     m_push (push), m_qi_vector_mode (qi_vector_mode)
 {
   int toi = m_to.get_addr_inc ();
@@ -1324,8 +1324,8 @@ class move_by_pieces_d : public op_by_pieces_d
  public:
   move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len,
 		    unsigned int align)
-    : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align,
-		      PUSHG_P (to))
+    : op_by_pieces_d (MOVE_MAX_PIECES, to, false, from, true, NULL,
+		      NULL, len, align, PUSHG_P (to))
   {
   }
   rtx finish_retmode (memop_ret);
@@ -1421,8 +1421,8 @@ class store_by_pieces_d : public op_by_pieces_d
   store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
 		     unsigned HOST_WIDE_INT len, unsigned int align,
 		     bool qi_vector_mode)
-    : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
-		      align, false, qi_vector_mode)
+    : op_by_pieces_d (STORE_MAX_PIECES, to, false, NULL_RTX, true, cfn,
+		      cfn_data, len, align, false, qi_vector_mode)
   {
   }
   rtx finish_retmode (memop_ret);
@@ -1618,8 +1618,8 @@ class compare_by_pieces_d : public op_by_pieces_d
   compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn,
 		       void *op1_cfn_data, HOST_WIDE_INT len, int align,
 		       rtx_code_label *fail_label)
-    : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len,
-		      align, false)
+    : op_by_pieces_d (COMPARE_MAX_PIECES, op0, true, op1, true, op1_cfn,
+		      op1_cfn_data, len, align, false)
   {
     m_fail_label = fail_label;
   }

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] by_pieces: Properly set m_max_size in op_by_pieces
  2021-08-03 21:22 ` H.J. Lu
@ 2021-08-04  7:27   ` Richard Sandiford
  2021-08-04 12:52     ` [PATCH v2] by_pieces: Pass MAX_PIECES to op_by_pieces_d H.J. Lu
  0 siblings, 1 reply; 7+ messages in thread
From: Richard Sandiford @ 2021-08-04  7:27 UTC (permalink / raw)
  To: H.J. Lu via Gcc-patches

"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> @@ -1122,8 +1122,8 @@ class op_by_pieces_d
>     and its associated FROM_CFN_DATA can be used to replace loads with
>     constant values.  LEN describes the length of the operation.  */
> 
> -op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
> -				rtx from, bool from_load,
> +op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to,
> +				bool to_load, rtx from, bool from_load,
>  				by_pieces_constfn from_cfn,
>  				void *from_cfn_data,
>  				unsigned HOST_WIDE_INT len,

The comment above the function needs to describe the new parameter.

OK with that change, thanks.

Richard

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v2] by_pieces: Pass MAX_PIECES to op_by_pieces_d
  2021-08-04  7:27   ` Richard Sandiford
@ 2021-08-04 12:52     ` H.J. Lu
  0 siblings, 0 replies; 7+ messages in thread
From: H.J. Lu @ 2021-08-04 12:52 UTC (permalink / raw)
  To: H.J. Lu via Gcc-patches, Richard Sandiford

[-- Attachment #1: Type: text/plain, Size: 953 bytes --]

On Wed, Aug 4, 2021 at 12:27 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > @@ -1122,8 +1122,8 @@ class op_by_pieces_d
> >     and its associated FROM_CFN_DATA can be used to replace loads with
> >     constant values.  LEN describes the length of the operation.  */
> >
> > -op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
> > -                             rtx from, bool from_load,
> > +op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to,
> > +                             bool to_load, rtx from, bool from_load,
> >                               by_pieces_constfn from_cfn,
> >                               void *from_cfn_data,
> >                               unsigned HOST_WIDE_INT len,
>
> The comment above the function needs to describe the new parameter.
>
> OK with that change, thanks.
>

This is the patch I am checking in.

Thanks.

---
H.J.

[-- Attachment #2: v2-0001-by_pieces-Pass-MAX_PIECES-to-op_by_pieces_d.patch --]
[-- Type: text/x-patch, Size: 3642 bytes --]

From 27343601ab064553eac695ed58e741c7b2f6059d Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 3 Aug 2021 06:17:22 -0700
Subject: [PATCH v2] by_pieces: Pass MAX_PIECES to op_by_pieces_d

Pass MAX_PIECES to op_by_pieces_d::op_by_pieces_d for move, store and
compare.

	PR target/101742
	* expr.c (op_by_pieces_d::op_by_pieces_d): Add a max_pieces
	argument to set m_max_size.
	(move_by_pieces_d): Pass MOVE_MAX_PIECES to op_by_pieces_d.
	(store_by_pieces_d): Pass STORE_MAX_PIECES to op_by_pieces_d.
	(compare_by_pieces_d): Pass COMPARE_MAX_PIECES to op_by_pieces_d.

diff --git a/gcc/expr.c b/gcc/expr.c
index b65cfcfdcd1..096c0315ecc 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -1110,8 +1110,8 @@ class op_by_pieces_d
   }
 
  public:
-  op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
-		  unsigned HOST_WIDE_INT, unsigned int, bool,
+  op_by_pieces_d (unsigned int, rtx, bool, rtx, bool, by_pieces_constfn,
+		  void *, unsigned HOST_WIDE_INT, unsigned int, bool,
 		  bool = false);
   void run ();
 };
@@ -1120,10 +1120,12 @@ class op_by_pieces_d
    objects named TO and FROM, which are identified as loads or stores
    by TO_LOAD and FROM_LOAD.  If FROM is a load, the optional FROM_CFN
    and its associated FROM_CFN_DATA can be used to replace loads with
-   constant values.  LEN describes the length of the operation.  */
+   constant values.  MAX_PIECES describes the maximum number of bytes
+   at a time which can be moved efficiently.  LEN describes the length
+   of the operation.  */
 
-op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
-				rtx from, bool from_load,
+op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to,
+				bool to_load, rtx from, bool from_load,
 				by_pieces_constfn from_cfn,
 				void *from_cfn_data,
 				unsigned HOST_WIDE_INT len,
@@ -1131,7 +1133,7 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
 				bool qi_vector_mode)
   : m_to (to, to_load, NULL, NULL),
     m_from (from, from_load, from_cfn, from_cfn_data),
-    m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
+    m_len (len), m_max_size (max_pieces + 1),
     m_push (push), m_qi_vector_mode (qi_vector_mode)
 {
   int toi = m_to.get_addr_inc ();
@@ -1324,8 +1326,8 @@ class move_by_pieces_d : public op_by_pieces_d
  public:
   move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len,
 		    unsigned int align)
-    : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align,
-		      PUSHG_P (to))
+    : op_by_pieces_d (MOVE_MAX_PIECES, to, false, from, true, NULL,
+		      NULL, len, align, PUSHG_P (to))
   {
   }
   rtx finish_retmode (memop_ret);
@@ -1421,8 +1423,8 @@ class store_by_pieces_d : public op_by_pieces_d
   store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
 		     unsigned HOST_WIDE_INT len, unsigned int align,
 		     bool qi_vector_mode)
-    : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
-		      align, false, qi_vector_mode)
+    : op_by_pieces_d (STORE_MAX_PIECES, to, false, NULL_RTX, true, cfn,
+		      cfn_data, len, align, false, qi_vector_mode)
   {
   }
   rtx finish_retmode (memop_ret);
@@ -1618,8 +1620,8 @@ class compare_by_pieces_d : public op_by_pieces_d
   compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn,
 		       void *op1_cfn_data, HOST_WIDE_INT len, int align,
 		       rtx_code_label *fail_label)
-    : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len,
-		      align, false)
+    : op_by_pieces_d (COMPARE_MAX_PIECES, op0, true, op1, true, op1_cfn,
+		      op1_cfn_data, len, align, false)
   {
     m_fail_label = fail_label;
   }

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v2] x86: Update STORE_MAX_PIECES
  2021-08-03 13:56 [PATCH] by_pieces: Properly set m_max_size in op_by_pieces H.J. Lu
  2021-08-03 21:22 ` H.J. Lu
@ 2021-08-04 13:33 ` H.J. Lu
  2021-08-04 18:46   ` Uros Bizjak
  1 sibling, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2021-08-04 13:33 UTC (permalink / raw)
  To: GCC Patches

[-- Attachment #1: Type: text/plain, Size: 2666 bytes --]

On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit
> move is enabled since x86 uses vec_duplicate, which is enabled only when
> inter-unit move is enabled, to implement store_by_pieces.
> 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to
> STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for
> compare_by_pieces.
>
> gcc/
>
>         PR target/101742
>         * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to
>         STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES
>         for compare_by_pieces.
>         * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode
>         only if TARGET_INTER_UNIT_MOVES_TO_VEC is true.
>
> gcc/testsuite/
>
>         PR target/101742
>         * gcc.target/i386/pr101742a.c: New test.
>         * gcc.target/i386/pr101742b.c: Likewise.
> ---
>  gcc/config/i386/i386.h                    | 20 +++++++++++---------
>  gcc/expr.c                                |  6 +++++-
>  gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++
>  gcc/testsuite/gcc.target/i386/pr101742b.c |  4 ++++
>  4 files changed, 36 insertions(+), 10 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c
>
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index bed9cd9da18..9b416abd5f4 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -1783,15 +1783,17 @@ typedef struct ix86_args {
>  /* STORE_MAX_PIECES is the number of bytes at a time that we can
>     store efficiently.  */
>  #define STORE_MAX_PIECES \
> -  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> -   ? 64 \
> -   : ((TARGET_AVX \
> -       && !TARGET_PREFER_AVX128 \
> -       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> -      ? 32 \
> -      : ((TARGET_SSE2 \
> -         && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> -        ? 16 : UNITS_PER_WORD)))
> +  (TARGET_INTER_UNIT_MOVES_TO_VEC \
> +   ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> +      ? 64 \
> +      : ((TARGET_AVX \
> +         && !TARGET_PREFER_AVX128 \
> +         && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> +         ? 32 \
> +         : ((TARGET_SSE2 \
> +             && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> +             ? 16 : UNITS_PER_WORD))) \
> +   : UNITS_PER_WORD)
>
>  /* If a memory-to-memory move would take MOVE_RATIO or more simple
>     move-instruction pairs, we will do a cpymem or libcall instead.

expr.c has been fixed.  Here is the v2 patch for the x86 backend.
OK for master?

Thanks.

-- 
H.J.

[-- Attachment #2: v2-0001-x86-Update-STORE_MAX_PIECES.patch --]
[-- Type: application/x-patch, Size: 2930 bytes --]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v2] x86: Update STORE_MAX_PIECES
  2021-08-04 13:33 ` [PATCH v2] x86: Update STORE_MAX_PIECES H.J. Lu
@ 2021-08-04 18:46   ` Uros Bizjak
  2021-08-04 20:01     ` [PATCH v3] " H.J. Lu
  0 siblings, 1 reply; 7+ messages in thread
From: Uros Bizjak @ 2021-08-04 18:46 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GCC Patches, Hongtao Liu

On Wed, Aug 4, 2021 at 3:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit
> > move is enabled since x86 uses vec_duplicate, which is enabled only when
> > inter-unit move is enabled, to implement store_by_pieces.
> > 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to
> > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for
> > compare_by_pieces.
> >
> > gcc/
> >
> >         PR target/101742
> >         * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to
> >         STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES
> >         for compare_by_pieces.
> >         * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode
> >         only if TARGET_INTER_UNIT_MOVES_TO_VEC is true.
> >
> > gcc/testsuite/
> >
> >         PR target/101742
> >         * gcc.target/i386/pr101742a.c: New test.
> >         * gcc.target/i386/pr101742b.c: Likewise.
> > ---
> >  gcc/config/i386/i386.h                    | 20 +++++++++++---------
> >  gcc/expr.c                                |  6 +++++-
> >  gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++
> >  gcc/testsuite/gcc.target/i386/pr101742b.c |  4 ++++
> >  4 files changed, 36 insertions(+), 10 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c
> >
> > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > index bed9cd9da18..9b416abd5f4 100644
> > --- a/gcc/config/i386/i386.h
> > +++ b/gcc/config/i386/i386.h
> > @@ -1783,15 +1783,17 @@ typedef struct ix86_args {
> >  /* STORE_MAX_PIECES is the number of bytes at a time that we can
> >     store efficiently.  */
> >  #define STORE_MAX_PIECES \
> > -  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> > -   ? 64 \
> > -   : ((TARGET_AVX \
> > -       && !TARGET_PREFER_AVX128 \
> > -       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> > -      ? 32 \
> > -      : ((TARGET_SSE2 \
> > -         && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> > -        ? 16 : UNITS_PER_WORD)))
> > +  (TARGET_INTER_UNIT_MOVES_TO_VEC \
> > +   ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> > +      ? 64 \
> > +      : ((TARGET_AVX \
> > +         && !TARGET_PREFER_AVX128 \
> > +         && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> > +         ? 32 \
> > +         : ((TARGET_SSE2 \
> > +             && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> > +             ? 16 : UNITS_PER_WORD))) \
> > +   : UNITS_PER_WORD)
> >
> >  /* If a memory-to-memory move would take MOVE_RATIO or more simple
> >     move-instruction pairs, we will do a cpymem or libcall instead.
>
> expr.c has been fixed.   Here is the v2 patch for x86 backend.
> OK for master?

OK, but please add a comment about vec_duplicate before the define
to explain the situation with TARGET_INTER_UNIT_MOVES_TO_VEC.

Thanks,
Uros.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH v3] x86: Update STORE_MAX_PIECES
  2021-08-04 18:46   ` Uros Bizjak
@ 2021-08-04 20:01     ` H.J. Lu
  0 siblings, 0 replies; 7+ messages in thread
From: H.J. Lu @ 2021-08-04 20:01 UTC (permalink / raw)
  To: Uros Bizjak; +Cc: GCC Patches, Hongtao Liu

[-- Attachment #1: Type: text/plain, Size: 3526 bytes --]

On Wed, Aug 4, 2021 at 11:46 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Wed, Aug 4, 2021 at 3:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit
> > > move is enabled since x86 uses vec_duplicate, which is enabled only when
> > > inter-unit move is enabled, to implement store_by_pieces.
> > > 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to
> > > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for
> > > compare_by_pieces.
> > >
> > > gcc/
> > >
> > >         PR target/101742
> > >         * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to
> > >         STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES
> > >         for compare_by_pieces.
> > >         * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode
> > >         only if TARGET_INTER_UNIT_MOVES_TO_VEC is true.
> > >
> > > gcc/testsuite/
> > >
> > >         PR target/101742
> > >         * gcc.target/i386/pr101742a.c: New test.
> > >         * gcc.target/i386/pr101742b.c: Likewise.
> > > ---
> > >  gcc/config/i386/i386.h                    | 20 +++++++++++---------
> > >  gcc/expr.c                                |  6 +++++-
> > >  gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++
> > >  gcc/testsuite/gcc.target/i386/pr101742b.c |  4 ++++
> > >  4 files changed, 36 insertions(+), 10 deletions(-)
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c
> > >
> > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > index bed9cd9da18..9b416abd5f4 100644
> > > --- a/gcc/config/i386/i386.h
> > > +++ b/gcc/config/i386/i386.h
> > > @@ -1783,15 +1783,17 @@ typedef struct ix86_args {
> > >  /* STORE_MAX_PIECES is the number of bytes at a time that we can
> > >     store efficiently.  */
> > >  #define STORE_MAX_PIECES \
> > > -  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> > > -   ? 64 \
> > > -   : ((TARGET_AVX \
> > > -       && !TARGET_PREFER_AVX128 \
> > > -       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> > > -      ? 32 \
> > > -      : ((TARGET_SSE2 \
> > > -         && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> > > -        ? 16 : UNITS_PER_WORD)))
> > > +  (TARGET_INTER_UNIT_MOVES_TO_VEC \
> > > +   ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> > > +      ? 64 \
> > > +      : ((TARGET_AVX \
> > > +         && !TARGET_PREFER_AVX128 \
> > > +         && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> > > +         ? 32 \
> > > +         : ((TARGET_SSE2 \
> > > +             && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> > > +             ? 16 : UNITS_PER_WORD))) \
> > > +   : UNITS_PER_WORD)
> > >
> > >  /* If a memory-to-memory move would take MOVE_RATIO or more simple
> > >     move-instruction pairs, we will do a cpymem or libcall instead.
> >
> > expr.c has been fixed.   Here is the v2 patch for x86 backend.
> > OK for master?
>
> OK, but please add the comment about vec_duplicate before the define
> to explain the situation with TARGET_INTER_UNIT_MOVES_TO_VEC.

This is what I am checking in, with the following comment added:

/* STORE_MAX_PIECES is the number of bytes at a time that we can store
   efficiently.  Allow 16/32/64 bytes only if inter-unit move is enabled
   since vec_duplicate enabled by inter-unit move is used to implement
   store_by_pieces of 16/32/64 bytes.  */

> Thanks,
> Uros.

Thanks.

-- 
H.J.

[-- Attachment #2: v3-0001-x86-Update-STORE_MAX_PIECES.patch --]
[-- Type: text/x-patch, Size: 3284 bytes --]

From 9487c165afb5b6083a3fc09a2e8b7bcabfe28765 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 3 Aug 2021 06:17:22 -0700
Subject: [PATCH v3] x86: Update STORE_MAX_PIECES

Update STORE_MAX_PIECES to allow 16/32/64 bytes only if inter-unit move
is enabled since vec_duplicate enabled by inter-unit move is used to
implement store_by_pieces of 16/32/64 bytes.

gcc/

	PR target/101742
	* config/i386/i386.h (STORE_MAX_PIECES): Allow 16/32/64 bytes
	only if TARGET_INTER_UNIT_MOVES_TO_VEC is true.

gcc/testsuite/

	PR target/101742
	* gcc.target/i386/pr101742a.c: New test.
	* gcc.target/i386/pr101742b.c: Likewise.
---
 gcc/config/i386/i386.h                    | 26 +++++++++++++----------
 gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++
 gcc/testsuite/gcc.target/i386/pr101742b.c |  4 ++++
 3 files changed, 35 insertions(+), 11 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c

diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index bed9cd9da18..21fe51bba40 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1780,18 +1780,22 @@ typedef struct ix86_args {
 	  && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
 	 ? 16 : UNITS_PER_WORD)))
 
-/* STORE_MAX_PIECES is the number of bytes at a time that we can
-   store efficiently.  */
+/* STORE_MAX_PIECES is the number of bytes at a time that we can store
+   efficiently.  Allow 16/32/64 bytes only if inter-unit move is enabled
+   since vec_duplicate enabled by inter-unit move is used to implement
+   store_by_pieces of 16/32/64 bytes.  */
 #define STORE_MAX_PIECES \
-  ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
-   ? 64 \
-   : ((TARGET_AVX \
-       && !TARGET_PREFER_AVX128 \
-       && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
-      ? 32 \
-      : ((TARGET_SSE2 \
-	  && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
-	 ? 16 : UNITS_PER_WORD)))
+  (TARGET_INTER_UNIT_MOVES_TO_VEC \
+   ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+      ? 64 \
+      : ((TARGET_AVX \
+	  && !TARGET_PREFER_AVX128 \
+	  && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+	  ? 32 \
+	  : ((TARGET_SSE2 \
+	      && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+	      ? 16 : UNITS_PER_WORD))) \
+   : UNITS_PER_WORD)
 
 /* If a memory-to-memory move would take MOVE_RATIO or more simple
    move-instruction pairs, we will do a cpymem or libcall instead.
diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c b/gcc/testsuite/gcc.target/i386/pr101742a.c
new file mode 100644
index 00000000000..67ea40587dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101742a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mtune=nano-x2" } */
+
+int n2;
+
+__attribute__ ((simd)) char
+w7 (void)
+{
+  short int xb = n2;
+  int qp;
+
+  for (qp = 0; qp < 2; ++qp)
+    xb = xb < 1;
+
+  return xb;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c b/gcc/testsuite/gcc.target/i386/pr101742b.c
new file mode 100644
index 00000000000..ba19064077b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101742b.c
@@ -0,0 +1,4 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mtune=nano-x2 -mtune-ctrl=sse_unaligned_store_optimal" } */
+
+#include "pr101742a.c"
-- 
2.31.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-08-04 20:02 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-03 13:56 [PATCH] by_pieces: Properly set m_max_size in op_by_pieces H.J. Lu
2021-08-03 21:22 ` H.J. Lu
2021-08-04  7:27   ` Richard Sandiford
2021-08-04 12:52     ` [PATCH v2] by_pieces: Pass MAX_PIECES to op_by_pieces_d H.J. Lu
2021-08-04 13:33 ` [PATCH v2] x86: Update STORE_MAX_PIECES H.J. Lu
2021-08-04 18:46   ` Uros Bizjak
2021-08-04 20:01     ` [PATCH v3] " H.J. Lu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).