* [PATCH] by_pieces: Properly set m_max_size in op_by_pieces
@ 2021-08-03 13:56 H.J. Lu
2021-08-03 21:22 ` H.J. Lu
2021-08-04 13:33 ` [PATCH v2] x86: Update STORE_MAX_PIECES H.J. Lu
0 siblings, 2 replies; 7+ messages in thread
From: H.J. Lu @ 2021-08-03 13:56 UTC (permalink / raw)
To: gcc-patches; +Cc: Uros Bizjak, Richard Sandiford
1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit
move is enabled, since x86 implements store_by_pieces with vec_duplicate,
which is enabled only when inter-unit move is enabled.
2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to
STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for
compare_by_pieces.
gcc/
PR target/101742
* expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to
STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES
for compare_by_pieces.
* config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode
only if TARGET_INTER_UNIT_MOVES_TO_VEC is true.
gcc/testsuite/
PR target/101742
* gcc.target/i386/pr101742a.c: New test.
* gcc.target/i386/pr101742b.c: Likewise.
---
gcc/config/i386/i386.h | 20 +++++++++++---------
gcc/expr.c | 6 +++++-
gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++
gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++
4 files changed, 36 insertions(+), 10 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index bed9cd9da18..9b416abd5f4 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1783,15 +1783,17 @@ typedef struct ix86_args {
/* STORE_MAX_PIECES is the number of bytes at a time that we can
store efficiently. */
#define STORE_MAX_PIECES \
- ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
- ? 64 \
- : ((TARGET_AVX \
- && !TARGET_PREFER_AVX128 \
- && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
- ? 32 \
- : ((TARGET_SSE2 \
- && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
- ? 16 : UNITS_PER_WORD)))
+ (TARGET_INTER_UNIT_MOVES_TO_VEC \
+ ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+ ? 64 \
+ : ((TARGET_AVX \
+ && !TARGET_PREFER_AVX128 \
+ && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+ ? 32 \
+ : ((TARGET_SSE2 \
+ && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+ ? 16 : UNITS_PER_WORD))) \
+ : UNITS_PER_WORD)
/* If a memory-to-memory move would take MOVE_RATIO or more simple
move-instruction pairs, we will do a cpymem or libcall instead.
diff --git a/gcc/expr.c b/gcc/expr.c
index b65cfcfdcd1..2964b38b9a5 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -1131,7 +1131,11 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
bool qi_vector_mode)
: m_to (to, to_load, NULL, NULL),
m_from (from, from_load, from_cfn, from_cfn_data),
- m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
+ m_len (len),
+ m_max_size (((!to_load && from == nullptr)
+ ? STORE_MAX_PIECES
+ : (from_cfn != nullptr
+ ? COMPARE_MAX_PIECES : MOVE_MAX_PIECES)) + 1),
m_push (push), m_qi_vector_mode (qi_vector_mode)
{
int toi = m_to.get_addr_inc ();
diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c b/gcc/testsuite/gcc.target/i386/pr101742a.c
new file mode 100644
index 00000000000..67ea40587dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101742a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mtune=nano-x2" } */
+
+int n2;
+
+__attribute__ ((simd)) char
+w7 (void)
+{
+ short int xb = n2;
+ int qp;
+
+ for (qp = 0; qp < 2; ++qp)
+ xb = xb < 1;
+
+ return xb;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c b/gcc/testsuite/gcc.target/i386/pr101742b.c
new file mode 100644
index 00000000000..ba19064077b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101742b.c
@@ -0,0 +1,4 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mtune=nano-x2 -mtune-ctrl=sse_unaligned_store_optimal" } */
+
+#include "pr101742a.c"
--
2.31.1
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] by_pieces: Properly set m_max_size in op_by_pieces
2021-08-03 13:56 [PATCH] by_pieces: Properly set m_max_size in op_by_pieces H.J. Lu
@ 2021-08-03 21:22 ` H.J. Lu
2021-08-04 7:27 ` Richard Sandiford
2021-08-04 13:33 ` [PATCH v2] x86: Update STORE_MAX_PIECES H.J. Lu
1 sibling, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2021-08-03 21:22 UTC (permalink / raw)
To: GCC Patches; +Cc: Uros Bizjak, Richard Sandiford
[-- Attachment #1: Type: text/plain, Size: 4285 bytes --]
On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit
> move is enabled since x86 uses vec_duplicate, which is enabled only when
> inter-unit move is enabled, to implement store_by_pieces.
> 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to
> STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for
> compare_by_pieces.
>
> gcc/
>
> PR target/101742
> * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to
> STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES
> for compare_by_pieces.
> * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode
> only if TARGET_INTER_UNIT_MOVES_TO_VEC is true.
>
> gcc/testsuite/
>
> PR target/101742
> * gcc.target/i386/pr101742a.c: New test.
> * gcc.target/i386/pr101742b.c: Likewise.
> ---
> gcc/config/i386/i386.h | 20 +++++++++++---------
> gcc/expr.c | 6 +++++-
> gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++
> gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++
> 4 files changed, 36 insertions(+), 10 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c
>
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index bed9cd9da18..9b416abd5f4 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -1783,15 +1783,17 @@ typedef struct ix86_args {
> /* STORE_MAX_PIECES is the number of bytes at a time that we can
> store efficiently. */
> #define STORE_MAX_PIECES \
> - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> - ? 64 \
> - : ((TARGET_AVX \
> - && !TARGET_PREFER_AVX128 \
> - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> - ? 32 \
> - : ((TARGET_SSE2 \
> - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> - ? 16 : UNITS_PER_WORD)))
> + (TARGET_INTER_UNIT_MOVES_TO_VEC \
> + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> + ? 64 \
> + : ((TARGET_AVX \
> + && !TARGET_PREFER_AVX128 \
> + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> + ? 32 \
> + : ((TARGET_SSE2 \
> + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> + ? 16 : UNITS_PER_WORD))) \
> + : UNITS_PER_WORD)
>
> /* If a memory-to-memory move would take MOVE_RATIO or more simple
> move-instruction pairs, we will do a cpymem or libcall instead.
> diff --git a/gcc/expr.c b/gcc/expr.c
> index b65cfcfdcd1..2964b38b9a5 100644
> --- a/gcc/expr.c
> +++ b/gcc/expr.c
> @@ -1131,7 +1131,11 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
> bool qi_vector_mode)
> : m_to (to, to_load, NULL, NULL),
> m_from (from, from_load, from_cfn, from_cfn_data),
> - m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
> + m_len (len),
> + m_max_size (((!to_load && from == nullptr)
> + ? STORE_MAX_PIECES
> + : (from_cfn != nullptr
> + ? COMPARE_MAX_PIECES : MOVE_MAX_PIECES)) + 1),
> m_push (push), m_qi_vector_mode (qi_vector_mode)
> {
> int toi = m_to.get_addr_inc ();
This larger expr.c patch passes the proper MAX_PIECES directly.
> diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c b/gcc/testsuite/gcc.target/i386/pr101742a.c
> new file mode 100644
> index 00000000000..67ea40587dd
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101742a.c
> @@ -0,0 +1,16 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -mtune=nano-x2" } */
> +
> +int n2;
> +
> +__attribute__ ((simd)) char
> +w7 (void)
> +{
> + short int xb = n2;
> + int qp;
> +
> + for (qp = 0; qp < 2; ++qp)
> + xb = xb < 1;
> +
> + return xb;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c b/gcc/testsuite/gcc.target/i386/pr101742b.c
> new file mode 100644
> index 00000000000..ba19064077b
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr101742b.c
> @@ -0,0 +1,4 @@
> +/* { dg-do compile } */
> +/* { dg-additional-options "-O3 -mtune=nano-x2 -mtune-ctrl=sse_unaligned_store_optimal" } */
> +
> +#include "pr101742a.c"
> --
> 2.31.1
>
--
H.J.
[-- Attachment #2: p.patch --]
[-- Type: text/x-patch, Size: 2736 bytes --]
diff --git a/gcc/expr.c b/gcc/expr.c
index b65cfcfdcd1..66ac1986f02 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -1110,8 +1110,8 @@ class op_by_pieces_d
}
public:
- op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
- unsigned HOST_WIDE_INT, unsigned int, bool,
+ op_by_pieces_d (unsigned int, rtx, bool, rtx, bool, by_pieces_constfn,
+ void *, unsigned HOST_WIDE_INT, unsigned int, bool,
bool = false);
void run ();
};
@@ -1122,8 +1122,8 @@ class op_by_pieces_d
and its associated FROM_CFN_DATA can be used to replace loads with
constant values. LEN describes the length of the operation. */
-op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
- rtx from, bool from_load,
+op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to,
+ bool to_load, rtx from, bool from_load,
by_pieces_constfn from_cfn,
void *from_cfn_data,
unsigned HOST_WIDE_INT len,
@@ -1131,7 +1131,7 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
bool qi_vector_mode)
: m_to (to, to_load, NULL, NULL),
m_from (from, from_load, from_cfn, from_cfn_data),
- m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
+ m_len (len), m_max_size (max_pieces + 1),
m_push (push), m_qi_vector_mode (qi_vector_mode)
{
int toi = m_to.get_addr_inc ();
@@ -1324,8 +1324,8 @@ class move_by_pieces_d : public op_by_pieces_d
public:
move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len,
unsigned int align)
- : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align,
- PUSHG_P (to))
+ : op_by_pieces_d (MOVE_MAX_PIECES, to, false, from, true, NULL,
+ NULL, len, align, PUSHG_P (to))
{
}
rtx finish_retmode (memop_ret);
@@ -1421,8 +1421,8 @@ class store_by_pieces_d : public op_by_pieces_d
store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
unsigned HOST_WIDE_INT len, unsigned int align,
bool qi_vector_mode)
- : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
- align, false, qi_vector_mode)
+ : op_by_pieces_d (STORE_MAX_PIECES, to, false, NULL_RTX, true, cfn,
+ cfn_data, len, align, false, qi_vector_mode)
{
}
rtx finish_retmode (memop_ret);
@@ -1618,8 +1618,8 @@ class compare_by_pieces_d : public op_by_pieces_d
compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn,
void *op1_cfn_data, HOST_WIDE_INT len, int align,
rtx_code_label *fail_label)
- : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len,
- align, false)
+ : op_by_pieces_d (COMPARE_MAX_PIECES, op0, true, op1, true, op1_cfn,
+ op1_cfn_data, len, align, false)
{
m_fail_label = fail_label;
}
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] by_pieces: Properly set m_max_size in op_by_pieces
2021-08-03 21:22 ` H.J. Lu
@ 2021-08-04 7:27 ` Richard Sandiford
2021-08-04 12:52 ` [PATCH v2] by_pieces: Pass MAX_PIECES to op_by_pieces_d H.J. Lu
0 siblings, 1 reply; 7+ messages in thread
From: Richard Sandiford @ 2021-08-04 7:27 UTC (permalink / raw)
To: H.J. Lu via Gcc-patches
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> @@ -1122,8 +1122,8 @@ class op_by_pieces_d
> and its associated FROM_CFN_DATA can be used to replace loads with
> constant values. LEN describes the length of the operation. */
>
> -op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
> - rtx from, bool from_load,
> +op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to,
> + bool to_load, rtx from, bool from_load,
> by_pieces_constfn from_cfn,
> void *from_cfn_data,
> unsigned HOST_WIDE_INT len,
The comment above the function needs to describe the new parameter.
OK with that change, thanks.
Richard
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH v2] by_pieces: Pass MAX_PIECES to op_by_pieces_d
2021-08-04 7:27 ` Richard Sandiford
@ 2021-08-04 12:52 ` H.J. Lu
0 siblings, 0 replies; 7+ messages in thread
From: H.J. Lu @ 2021-08-04 12:52 UTC (permalink / raw)
To: H.J. Lu via Gcc-patches, Richard Sandiford
[-- Attachment #1: Type: text/plain, Size: 953 bytes --]
On Wed, Aug 4, 2021 at 12:27 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > @@ -1122,8 +1122,8 @@ class op_by_pieces_d
> > and its associated FROM_CFN_DATA can be used to replace loads with
> > constant values. LEN describes the length of the operation. */
> >
> > -op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
> > - rtx from, bool from_load,
> > +op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to,
> > + bool to_load, rtx from, bool from_load,
> > by_pieces_constfn from_cfn,
> > void *from_cfn_data,
> > unsigned HOST_WIDE_INT len,
>
> The comment above the function needs to describe the new parameter.
>
> OK with that change, thanks.
>
This is the patch I am checking in.
Thanks.
---
H.J.
[-- Attachment #2: v2-0001-by_pieces-Pass-MAX_PIECES-to-op_by_pieces_d.patch --]
[-- Type: text/x-patch, Size: 3642 bytes --]
From 27343601ab064553eac695ed58e741c7b2f6059d Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 3 Aug 2021 06:17:22 -0700
Subject: [PATCH v2] by_pieces: Pass MAX_PIECES to op_by_pieces_d
Pass MAX_PIECES to op_by_pieces_d::op_by_pieces_d for move, store and
compare.
PR target/101742
* expr.c (op_by_pieces_d::op_by_pieces_d): Add a max_pieces
argument to set m_max_size.
(move_by_pieces_d): Pass MOVE_MAX_PIECES to op_by_pieces_d.
(store_by_pieces_d): Pass STORE_MAX_PIECES to op_by_pieces_d.
(compare_by_pieces_d): Pass COMPARE_MAX_PIECES to op_by_pieces_d.
diff --git a/gcc/expr.c b/gcc/expr.c
index b65cfcfdcd1..096c0315ecc 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -1110,8 +1110,8 @@ class op_by_pieces_d
}
public:
- op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
- unsigned HOST_WIDE_INT, unsigned int, bool,
+ op_by_pieces_d (unsigned int, rtx, bool, rtx, bool, by_pieces_constfn,
+ void *, unsigned HOST_WIDE_INT, unsigned int, bool,
bool = false);
void run ();
};
@@ -1120,10 +1120,12 @@ class op_by_pieces_d
objects named TO and FROM, which are identified as loads or stores
by TO_LOAD and FROM_LOAD. If FROM is a load, the optional FROM_CFN
and its associated FROM_CFN_DATA can be used to replace loads with
- constant values. LEN describes the length of the operation. */
+ constant values. MAX_PIECES describes the maximum number of bytes
+ at a time which can be moved efficiently. LEN describes the length
+ of the operation. */
-op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
- rtx from, bool from_load,
+op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to,
+ bool to_load, rtx from, bool from_load,
by_pieces_constfn from_cfn,
void *from_cfn_data,
unsigned HOST_WIDE_INT len,
@@ -1131,7 +1133,7 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
bool qi_vector_mode)
: m_to (to, to_load, NULL, NULL),
m_from (from, from_load, from_cfn, from_cfn_data),
- m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
+ m_len (len), m_max_size (max_pieces + 1),
m_push (push), m_qi_vector_mode (qi_vector_mode)
{
int toi = m_to.get_addr_inc ();
@@ -1324,8 +1326,8 @@ class move_by_pieces_d : public op_by_pieces_d
public:
move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len,
unsigned int align)
- : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align,
- PUSHG_P (to))
+ : op_by_pieces_d (MOVE_MAX_PIECES, to, false, from, true, NULL,
+ NULL, len, align, PUSHG_P (to))
{
}
rtx finish_retmode (memop_ret);
@@ -1421,8 +1423,8 @@ class store_by_pieces_d : public op_by_pieces_d
store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
unsigned HOST_WIDE_INT len, unsigned int align,
bool qi_vector_mode)
- : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
- align, false, qi_vector_mode)
+ : op_by_pieces_d (STORE_MAX_PIECES, to, false, NULL_RTX, true, cfn,
+ cfn_data, len, align, false, qi_vector_mode)
{
}
rtx finish_retmode (memop_ret);
@@ -1618,8 +1620,8 @@ class compare_by_pieces_d : public op_by_pieces_d
compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn,
void *op1_cfn_data, HOST_WIDE_INT len, int align,
rtx_code_label *fail_label)
- : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len,
- align, false)
+ : op_by_pieces_d (COMPARE_MAX_PIECES, op0, true, op1, true, op1_cfn,
+ op1_cfn_data, len, align, false)
{
m_fail_label = fail_label;
}
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH v2] x86: Update STORE_MAX_PIECES
2021-08-03 13:56 [PATCH] by_pieces: Properly set m_max_size in op_by_pieces H.J. Lu
2021-08-03 21:22 ` H.J. Lu
@ 2021-08-04 13:33 ` H.J. Lu
2021-08-04 18:46 ` Uros Bizjak
1 sibling, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2021-08-04 13:33 UTC (permalink / raw)
To: GCC Patches
[-- Attachment #1: Type: text/plain, Size: 2666 bytes --]
On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit
> move is enabled since x86 uses vec_duplicate, which is enabled only when
> inter-unit move is enabled, to implement store_by_pieces.
> 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to
> STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for
> compare_by_pieces.
>
> gcc/
>
> PR target/101742
> * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to
> STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES
> for compare_by_pieces.
> * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode
> only if TARGET_INTER_UNIT_MOVES_TO_VEC is true.
>
> gcc/testsuite/
>
> PR target/101742
> * gcc.target/i386/pr101742a.c: New test.
> * gcc.target/i386/pr101742b.c: Likewise.
> ---
> gcc/config/i386/i386.h | 20 +++++++++++---------
> gcc/expr.c | 6 +++++-
> gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++
> gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++
> 4 files changed, 36 insertions(+), 10 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c
> create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c
>
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index bed9cd9da18..9b416abd5f4 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -1783,15 +1783,17 @@ typedef struct ix86_args {
> /* STORE_MAX_PIECES is the number of bytes at a time that we can
> store efficiently. */
> #define STORE_MAX_PIECES \
> - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> - ? 64 \
> - : ((TARGET_AVX \
> - && !TARGET_PREFER_AVX128 \
> - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> - ? 32 \
> - : ((TARGET_SSE2 \
> - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> - ? 16 : UNITS_PER_WORD)))
> + (TARGET_INTER_UNIT_MOVES_TO_VEC \
> + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> + ? 64 \
> + : ((TARGET_AVX \
> + && !TARGET_PREFER_AVX128 \
> + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> + ? 32 \
> + : ((TARGET_SSE2 \
> + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> + ? 16 : UNITS_PER_WORD))) \
> + : UNITS_PER_WORD)
>
> /* If a memory-to-memory move would take MOVE_RATIO or more simple
> move-instruction pairs, we will do a cpymem or libcall instead.
expr.c has been fixed. Here is the v2 patch for the x86 backend.
OK for master?
Thanks.
--
H.J.
[-- Attachment #2: v2-0001-x86-Update-STORE_MAX_PIECES.patch --]
[-- Type: application/x-patch, Size: 2930 bytes --]
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] x86: Update STORE_MAX_PIECES
2021-08-04 13:33 ` [PATCH v2] x86: Update STORE_MAX_PIECES H.J. Lu
@ 2021-08-04 18:46 ` Uros Bizjak
2021-08-04 20:01 ` [PATCH v3] " H.J. Lu
0 siblings, 1 reply; 7+ messages in thread
From: Uros Bizjak @ 2021-08-04 18:46 UTC (permalink / raw)
To: H.J. Lu; +Cc: GCC Patches, Hongtao Liu
On Wed, Aug 4, 2021 at 3:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit
> > move is enabled since x86 uses vec_duplicate, which is enabled only when
> > inter-unit move is enabled, to implement store_by_pieces.
> > 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to
> > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for
> > compare_by_pieces.
> >
> > gcc/
> >
> > PR target/101742
> > * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to
> > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES
> > for compare_by_pieces.
> > * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode
> > only if TARGET_INTER_UNIT_MOVES_TO_VEC is true.
> >
> > gcc/testsuite/
> >
> > PR target/101742
> > * gcc.target/i386/pr101742a.c: New test.
> > * gcc.target/i386/pr101742b.c: Likewise.
> > ---
> > gcc/config/i386/i386.h | 20 +++++++++++---------
> > gcc/expr.c | 6 +++++-
> > gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++
> > gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++
> > 4 files changed, 36 insertions(+), 10 deletions(-)
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c
> > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c
> >
> > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > index bed9cd9da18..9b416abd5f4 100644
> > --- a/gcc/config/i386/i386.h
> > +++ b/gcc/config/i386/i386.h
> > @@ -1783,15 +1783,17 @@ typedef struct ix86_args {
> > /* STORE_MAX_PIECES is the number of bytes at a time that we can
> > store efficiently. */
> > #define STORE_MAX_PIECES \
> > - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> > - ? 64 \
> > - : ((TARGET_AVX \
> > - && !TARGET_PREFER_AVX128 \
> > - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> > - ? 32 \
> > - : ((TARGET_SSE2 \
> > - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> > - ? 16 : UNITS_PER_WORD)))
> > + (TARGET_INTER_UNIT_MOVES_TO_VEC \
> > + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> > + ? 64 \
> > + : ((TARGET_AVX \
> > + && !TARGET_PREFER_AVX128 \
> > + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> > + ? 32 \
> > + : ((TARGET_SSE2 \
> > + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> > + ? 16 : UNITS_PER_WORD))) \
> > + : UNITS_PER_WORD)
> >
> > /* If a memory-to-memory move would take MOVE_RATIO or more simple
> > move-instruction pairs, we will do a cpymem or libcall instead.
>
> expr.c has been fixed. Here is the v2 patch for x86 backend.
> OK for master?
OK, but please add a comment about vec_duplicate before the define
to explain the situation with TARGET_INTER_UNIT_MOVES_TO_VEC.
Thanks,
Uros.
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH v3] x86: Update STORE_MAX_PIECES
2021-08-04 18:46 ` Uros Bizjak
@ 2021-08-04 20:01 ` H.J. Lu
0 siblings, 0 replies; 7+ messages in thread
From: H.J. Lu @ 2021-08-04 20:01 UTC (permalink / raw)
To: Uros Bizjak; +Cc: GCC Patches, Hongtao Liu
[-- Attachment #1: Type: text/plain, Size: 3526 bytes --]
On Wed, Aug 4, 2021 at 11:46 AM Uros Bizjak <ubizjak@gmail.com> wrote:
>
> On Wed, Aug 4, 2021 at 3:34 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit
> > > move is enabled since x86 uses vec_duplicate, which is enabled only when
> > > inter-unit move is enabled, to implement store_by_pieces.
> > > 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to
> > > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for
> > > compare_by_pieces.
> > >
> > > gcc/
> > >
> > > PR target/101742
> > > * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to
> > > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES
> > > for compare_by_pieces.
> > > * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode
> > > only if TARGET_INTER_UNIT_MOVES_TO_VEC is true.
> > >
> > > gcc/testsuite/
> > >
> > > PR target/101742
> > > * gcc.target/i386/pr101742a.c: New test.
> > > * gcc.target/i386/pr101742b.c: Likewise.
> > > ---
> > > gcc/config/i386/i386.h | 20 +++++++++++---------
> > > gcc/expr.c | 6 +++++-
> > > gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++
> > > gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++
> > > 4 files changed, 36 insertions(+), 10 deletions(-)
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c
> > > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c
> > >
> > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> > > index bed9cd9da18..9b416abd5f4 100644
> > > --- a/gcc/config/i386/i386.h
> > > +++ b/gcc/config/i386/i386.h
> > > @@ -1783,15 +1783,17 @@ typedef struct ix86_args {
> > > /* STORE_MAX_PIECES is the number of bytes at a time that we can
> > > store efficiently. */
> > > #define STORE_MAX_PIECES \
> > > - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> > > - ? 64 \
> > > - : ((TARGET_AVX \
> > > - && !TARGET_PREFER_AVX128 \
> > > - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> > > - ? 32 \
> > > - : ((TARGET_SSE2 \
> > > - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> > > - ? 16 : UNITS_PER_WORD)))
> > > + (TARGET_INTER_UNIT_MOVES_TO_VEC \
> > > + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
> > > + ? 64 \
> > > + : ((TARGET_AVX \
> > > + && !TARGET_PREFER_AVX128 \
> > > + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
> > > + ? 32 \
> > > + : ((TARGET_SSE2 \
> > > + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
> > > + ? 16 : UNITS_PER_WORD))) \
> > > + : UNITS_PER_WORD)
> > >
> > > /* If a memory-to-memory move would take MOVE_RATIO or more simple
> > > move-instruction pairs, we will do a cpymem or libcall instead.
> >
> > expr.c has been fixed. Here is the v2 patch for x86 backend.
> > OK for master?
>
> OK, but please add the comment about vec_duplicate before the define
> to explain the situation with TARGET_INTER_UNIT_MOVES_TO_VEC.
This is what I am checking in, with the following comment added:
/* STORE_MAX_PIECES is the number of bytes at a time that we can store
efficiently. Allow 16/32/64 bytes only if inter-unit move is enabled
since vec_duplicate enabled by inter-unit move is used to implement
store_by_pieces of 16/32/64 bytes. */
> Thanks,
> Uros.
Thanks.
--
H.J.
[-- Attachment #2: v3-0001-x86-Update-STORE_MAX_PIECES.patch --]
[-- Type: text/x-patch, Size: 3284 bytes --]
From 9487c165afb5b6083a3fc09a2e8b7bcabfe28765 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 3 Aug 2021 06:17:22 -0700
Subject: [PATCH v3] x86: Update STORE_MAX_PIECES
Update STORE_MAX_PIECES to allow 16/32/64 bytes only if inter-unit move
is enabled, since vec_duplicate, which is enabled only when inter-unit
move is enabled, is used to implement store_by_pieces of 16/32/64 bytes.
gcc/
PR target/101742
* config/i386/i386.h (STORE_MAX_PIECES): Allow 16/32/64 bytes
only if TARGET_INTER_UNIT_MOVES_TO_VEC is true.
gcc/testsuite/
PR target/101742
* gcc.target/i386/pr101742a.c: New test.
* gcc.target/i386/pr101742b.c: Likewise.
---
gcc/config/i386/i386.h | 26 +++++++++++++----------
gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++
gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++
3 files changed, 35 insertions(+), 11 deletions(-)
create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c
create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c
diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
index bed9cd9da18..21fe51bba40 100644
--- a/gcc/config/i386/i386.h
+++ b/gcc/config/i386/i386.h
@@ -1780,18 +1780,22 @@ typedef struct ix86_args {
&& TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
? 16 : UNITS_PER_WORD)))
-/* STORE_MAX_PIECES is the number of bytes at a time that we can
- store efficiently. */
+/* STORE_MAX_PIECES is the number of bytes at a time that we can store
+ efficiently. Allow 16/32/64 bytes only if inter-unit move is enabled
+ since vec_duplicate enabled by inter-unit move is used to implement
+ store_by_pieces of 16/32/64 bytes. */
#define STORE_MAX_PIECES \
- ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
- ? 64 \
- : ((TARGET_AVX \
- && !TARGET_PREFER_AVX128 \
- && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
- ? 32 \
- : ((TARGET_SSE2 \
- && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
- ? 16 : UNITS_PER_WORD)))
+ (TARGET_INTER_UNIT_MOVES_TO_VEC \
+ ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \
+ ? 64 \
+ : ((TARGET_AVX \
+ && !TARGET_PREFER_AVX128 \
+ && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \
+ ? 32 \
+ : ((TARGET_SSE2 \
+ && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \
+ ? 16 : UNITS_PER_WORD))) \
+ : UNITS_PER_WORD)
/* If a memory-to-memory move would take MOVE_RATIO or more simple
move-instruction pairs, we will do a cpymem or libcall instead.
diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c b/gcc/testsuite/gcc.target/i386/pr101742a.c
new file mode 100644
index 00000000000..67ea40587dd
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101742a.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mtune=nano-x2" } */
+
+int n2;
+
+__attribute__ ((simd)) char
+w7 (void)
+{
+ short int xb = n2;
+ int qp;
+
+ for (qp = 0; qp < 2; ++qp)
+ xb = xb < 1;
+
+ return xb;
+}
diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c b/gcc/testsuite/gcc.target/i386/pr101742b.c
new file mode 100644
index 00000000000..ba19064077b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr101742b.c
@@ -0,0 +1,4 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -mtune=nano-x2 -mtune-ctrl=sse_unaligned_store_optimal" } */
+
+#include "pr101742a.c"
--
2.31.1
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2021-08-04 20:02 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-03 13:56 [PATCH] by_pieces: Properly set m_max_size in op_by_pieces H.J. Lu
2021-08-03 21:22 ` H.J. Lu
2021-08-04 7:27 ` Richard Sandiford
2021-08-04 12:52 ` [PATCH v2] by_pieces: Pass MAX_PIECES to op_by_pieces_d H.J. Lu
2021-08-04 13:33 ` [PATCH v2] x86: Update STORE_MAX_PIECES H.J. Lu
2021-08-04 18:46 ` Uros Bizjak
2021-08-04 20:01 ` [PATCH v3] " H.J. Lu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).