* [PATCH] by_pieces: Properly set m_max_size in op_by_pieces @ 2021-08-03 13:56 H.J. Lu 2021-08-03 21:22 ` H.J. Lu 2021-08-04 13:33 ` [PATCH v2] x86: Update STORE_MAX_PIECES H.J. Lu 0 siblings, 2 replies; 7+ messages in thread From: H.J. Lu @ 2021-08-03 13:56 UTC (permalink / raw) To: gcc-patches; +Cc: Uros Bizjak, Richard Sandiford 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit move is enabled since x86 uses vec_duplicate, which is enabled only when inter-unit move is enabled, to implement store_by_pieces. 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for compare_by_pieces. gcc/ PR target/101742 * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for compare_by_pieces. * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode only if TARGET_INTER_UNIT_MOVES_TO_VEC is true. gcc/testsuite/ PR target/101742 * gcc.target/i386/pr101742a.c: New test. * gcc.target/i386/pr101742b.c: Likewise. --- gcc/config/i386/i386.h | 20 +++++++++++--------- gcc/expr.c | 6 +++++- gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++ gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++ 4 files changed, 36 insertions(+), 10 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index bed9cd9da18..9b416abd5f4 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1783,15 +1783,17 @@ typedef struct ix86_args { /* STORE_MAX_PIECES is the number of bytes at a time that we can store efficiently. */ #define STORE_MAX_PIECES \ - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ - ? 64 \ - : ((TARGET_AVX \ - && !TARGET_PREFER_AVX128 \ - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ - ? 32 \ - : ((TARGET_SSE2 \ - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ - ? 
16 : UNITS_PER_WORD))) + (TARGET_INTER_UNIT_MOVES_TO_VEC \ + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ + ? 64 \ + : ((TARGET_AVX \ + && !TARGET_PREFER_AVX128 \ + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ + ? 32 \ + : ((TARGET_SSE2 \ + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ + ? 16 : UNITS_PER_WORD))) \ + : UNITS_PER_WORD) /* If a memory-to-memory move would take MOVE_RATIO or more simple move-instruction pairs, we will do a cpymem or libcall instead. diff --git a/gcc/expr.c b/gcc/expr.c index b65cfcfdcd1..2964b38b9a5 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -1131,7 +1131,11 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, bool qi_vector_mode) : m_to (to, to_load, NULL, NULL), m_from (from, from_load, from_cfn, from_cfn_data), - m_len (len), m_max_size (MOVE_MAX_PIECES + 1), + m_len (len), + m_max_size (((!to_load && from == nullptr) + ? STORE_MAX_PIECES + : (from_cfn != nullptr + ? COMPARE_MAX_PIECES : MOVE_MAX_PIECES)) + 1), m_push (push), m_qi_vector_mode (qi_vector_mode) { int toi = m_to.get_addr_inc (); diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c b/gcc/testsuite/gcc.target/i386/pr101742a.c new file mode 100644 index 00000000000..67ea40587dd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101742a.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -mtune=nano-x2" } */ + +int n2; + +__attribute__ ((simd)) char +w7 (void) +{ + short int xb = n2; + int qp; + + for (qp = 0; qp < 2; ++qp) + xb = xb < 1; + + return xb; +} diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c b/gcc/testsuite/gcc.target/i386/pr101742b.c new file mode 100644 index 00000000000..ba19064077b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101742b.c @@ -0,0 +1,4 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -mtune=nano-x2 -mtune-ctrl=sse_unaligned_store_optimal" } */ + +#include "pr101742a.c" -- 2.31.1 ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] by_pieces: Properly set m_max_size in op_by_pieces 2021-08-03 13:56 [PATCH] by_pieces: Properly set m_max_size in op_by_pieces H.J. Lu @ 2021-08-03 21:22 ` H.J. Lu 2021-08-04 7:27 ` Richard Sandiford 2021-08-04 13:33 ` [PATCH v2] x86: Update STORE_MAX_PIECES H.J. Lu 1 sibling, 1 reply; 7+ messages in thread From: H.J. Lu @ 2021-08-03 21:22 UTC (permalink / raw) To: GCC Patches; +Cc: Uros Bizjak, Richard Sandiford [-- Attachment #1: Type: text/plain, Size: 4285 bytes --] On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit > move is enabled since x86 uses vec_duplicate, which is enabled only when > inter-unit move is enabled, to implement store_by_pieces. > 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for > compare_by_pieces. > > gcc/ > > PR target/101742 > * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES > for compare_by_pieces. > * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode > only if TARGET_INTER_UNIT_MOVES_TO_VEC is true. > > gcc/testsuite/ > > PR target/101742 > * gcc.target/i386/pr101742a.c: New test. > * gcc.target/i386/pr101742b.c: Likewise. 
> --- > gcc/config/i386/i386.h | 20 +++++++++++--------- > gcc/expr.c | 6 +++++- > gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++ > 4 files changed, 36 insertions(+), 10 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > index bed9cd9da18..9b416abd5f4 100644 > --- a/gcc/config/i386/i386.h > +++ b/gcc/config/i386/i386.h > @@ -1783,15 +1783,17 @@ typedef struct ix86_args { > /* STORE_MAX_PIECES is the number of bytes at a time that we can > store efficiently. */ > #define STORE_MAX_PIECES \ > - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ > - ? 64 \ > - : ((TARGET_AVX \ > - && !TARGET_PREFER_AVX128 \ > - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ > - ? 32 \ > - : ((TARGET_SSE2 \ > - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ > - ? 16 : UNITS_PER_WORD))) > + (TARGET_INTER_UNIT_MOVES_TO_VEC \ > + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ > + ? 64 \ > + : ((TARGET_AVX \ > + && !TARGET_PREFER_AVX128 \ > + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ > + ? 32 \ > + : ((TARGET_SSE2 \ > + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ > + ? 16 : UNITS_PER_WORD))) \ > + : UNITS_PER_WORD) > > /* If a memory-to-memory move would take MOVE_RATIO or more simple > move-instruction pairs, we will do a cpymem or libcall instead. > diff --git a/gcc/expr.c b/gcc/expr.c > index b65cfcfdcd1..2964b38b9a5 100644 > --- a/gcc/expr.c > +++ b/gcc/expr.c > @@ -1131,7 +1131,11 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, > bool qi_vector_mode) > : m_to (to, to_load, NULL, NULL), > m_from (from, from_load, from_cfn, from_cfn_data), > - m_len (len), m_max_size (MOVE_MAX_PIECES + 1), > + m_len (len), > + m_max_size (((!to_load && from == nullptr) > + ? STORE_MAX_PIECES > + : (from_cfn != nullptr > + ? 
COMPARE_MAX_PIECES : MOVE_MAX_PIECES)) + 1), > m_push (push), m_qi_vector_mode (qi_vector_mode) > { > int toi = m_to.get_addr_inc (); This larger expr.c patch passes the proper MAX_PIECES directly. > diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c b/gcc/testsuite/gcc.target/i386/pr101742a.c > new file mode 100644 > index 00000000000..67ea40587dd > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr101742a.c > @@ -0,0 +1,16 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-O3 -mtune=nano-x2" } */ > + > +int n2; > + > +__attribute__ ((simd)) char > +w7 (void) > +{ > + short int xb = n2; > + int qp; > + > + for (qp = 0; qp < 2; ++qp) > + xb = xb < 1; > + > + return xb; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c b/gcc/testsuite/gcc.target/i386/pr101742b.c > new file mode 100644 > index 00000000000..ba19064077b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr101742b.c > @@ -0,0 +1,4 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-O3 -mtune=nano-x2 -mtune-ctrl=sse_unaligned_store_optimal" } */ > + > +#include "pr101742a.c" > -- > 2.31.1 > -- H.J. [-- Attachment #2: p.patch --] [-- Type: text/x-patch, Size: 2736 bytes --] diff --git a/gcc/expr.c b/gcc/expr.c index b65cfcfdcd1..66ac1986f02 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -1110,8 +1110,8 @@ class op_by_pieces_d } public: - op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *, - unsigned HOST_WIDE_INT, unsigned int, bool, + op_by_pieces_d (unsigned int, rtx, bool, rtx, bool, by_pieces_constfn, + void *, unsigned HOST_WIDE_INT, unsigned int, bool, bool = false); void run (); }; @@ -1122,8 +1122,8 @@ class op_by_pieces_d and its associated FROM_CFN_DATA can be used to replace loads with constant values. LEN describes the length of the operation. 
*/ -op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, - rtx from, bool from_load, +op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to, + bool to_load, rtx from, bool from_load, by_pieces_constfn from_cfn, void *from_cfn_data, unsigned HOST_WIDE_INT len, @@ -1131,7 +1131,7 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, bool qi_vector_mode) : m_to (to, to_load, NULL, NULL), m_from (from, from_load, from_cfn, from_cfn_data), - m_len (len), m_max_size (MOVE_MAX_PIECES + 1), + m_len (len), m_max_size (max_pieces + 1), m_push (push), m_qi_vector_mode (qi_vector_mode) { int toi = m_to.get_addr_inc (); @@ -1324,8 +1324,8 @@ class move_by_pieces_d : public op_by_pieces_d public: move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len, unsigned int align) - : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align, - PUSHG_P (to)) + : op_by_pieces_d (MOVE_MAX_PIECES, to, false, from, true, NULL, + NULL, len, align, PUSHG_P (to)) { } rtx finish_retmode (memop_ret); @@ -1421,8 +1421,8 @@ class store_by_pieces_d : public op_by_pieces_d store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data, unsigned HOST_WIDE_INT len, unsigned int align, bool qi_vector_mode) - : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len, - align, false, qi_vector_mode) + : op_by_pieces_d (STORE_MAX_PIECES, to, false, NULL_RTX, true, cfn, + cfn_data, len, align, false, qi_vector_mode) { } rtx finish_retmode (memop_ret); @@ -1618,8 +1618,8 @@ class compare_by_pieces_d : public op_by_pieces_d compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn, void *op1_cfn_data, HOST_WIDE_INT len, int align, rtx_code_label *fail_label) - : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len, - align, false) + : op_by_pieces_d (COMPARE_MAX_PIECES, op0, true, op1, true, op1_cfn, + op1_cfn_data, len, align, false) { m_fail_label = fail_label; } ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH] by_pieces: Properly set m_max_size in op_by_pieces 2021-08-03 21:22 ` H.J. Lu @ 2021-08-04 7:27 ` Richard Sandiford 2021-08-04 12:52 ` [PATCH v2] by_pieces: Pass MAX_PIECES to op_by_pieces_d H.J. Lu 0 siblings, 1 reply; 7+ messages in thread From: Richard Sandiford @ 2021-08-04 7:27 UTC (permalink / raw) To: H.J. Lu via Gcc-patches "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > @@ -1122,8 +1122,8 @@ class op_by_pieces_d > and its associated FROM_CFN_DATA can be used to replace loads with > constant values. LEN describes the length of the operation. */ > > -op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, > - rtx from, bool from_load, > +op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to, > + bool to_load, rtx from, bool from_load, > by_pieces_constfn from_cfn, > void *from_cfn_data, > unsigned HOST_WIDE_INT len, The comment above the function needs to describe the new parameter. OK with that change, thanks. Richard ^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH v2] by_pieces: Pass MAX_PIECES to op_by_pieces_d 2021-08-04 7:27 ` Richard Sandiford @ 2021-08-04 12:52 ` H.J. Lu 0 siblings, 0 replies; 7+ messages in thread From: H.J. Lu @ 2021-08-04 12:52 UTC (permalink / raw) To: H.J. Lu via Gcc-patches, Richard Sandiford [-- Attachment #1: Type: text/plain, Size: 953 bytes --] On Wed, Aug 4, 2021 at 12:27 AM Richard Sandiford <richard.sandiford@arm.com> wrote: > > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > > @@ -1122,8 +1122,8 @@ class op_by_pieces_d > > and its associated FROM_CFN_DATA can be used to replace loads with > > constant values. LEN describes the length of the operation. */ > > > > -op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, > > - rtx from, bool from_load, > > +op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to, > > + bool to_load, rtx from, bool from_load, > > by_pieces_constfn from_cfn, > > void *from_cfn_data, > > unsigned HOST_WIDE_INT len, > > The comment above the function needs to describe the new parameter. > > OK with that change, thanks. > This is the patch I am checking in. Thanks. --- H.J. [-- Attachment #2: v2-0001-by_pieces-Pass-MAX_PIECES-to-op_by_pieces_d.patch --] [-- Type: text/x-patch, Size: 3642 bytes --] From 27343601ab064553eac695ed58e741c7b2f6059d Mon Sep 17 00:00:00 2001 From: "H.J. Lu" <hjl.tools@gmail.com> Date: Tue, 3 Aug 2021 06:17:22 -0700 Subject: [PATCH v2] by_pieces: Pass MAX_PIECES to op_by_pieces_d Pass MAX_PIECES to op_by_pieces_d::op_by_pieces_d for move, store and compare. PR target/101742 * expr.c (op_by_pieces_d::op_by_pieces_d): Add a max_pieces argument to set m_max_size. (move_by_pieces_d): Pass MOVE_MAX_PIECES to op_by_pieces_d. (store_by_pieces_d): Pass STORE_MAX_PIECES to op_by_pieces_d. (compare_by_pieces_d): Pass COMPARE_MAX_PIECES to op_by_pieces_d. 
diff --git a/gcc/expr.c b/gcc/expr.c index b65cfcfdcd1..096c0315ecc 100644 --- a/gcc/expr.c +++ b/gcc/expr.c @@ -1110,8 +1110,8 @@ class op_by_pieces_d } public: - op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *, - unsigned HOST_WIDE_INT, unsigned int, bool, + op_by_pieces_d (unsigned int, rtx, bool, rtx, bool, by_pieces_constfn, + void *, unsigned HOST_WIDE_INT, unsigned int, bool, bool = false); void run (); }; @@ -1120,10 +1120,12 @@ class op_by_pieces_d objects named TO and FROM, which are identified as loads or stores by TO_LOAD and FROM_LOAD. If FROM is a load, the optional FROM_CFN and its associated FROM_CFN_DATA can be used to replace loads with - constant values. LEN describes the length of the operation. */ + constant values. MAX_PIECES describes the maximum number of bytes + at a time which can be moved efficiently. LEN describes the length + of the operation. */ -op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, - rtx from, bool from_load, +op_by_pieces_d::op_by_pieces_d (unsigned int max_pieces, rtx to, + bool to_load, rtx from, bool from_load, by_pieces_constfn from_cfn, void *from_cfn_data, unsigned HOST_WIDE_INT len, @@ -1131,7 +1133,7 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, bool qi_vector_mode) : m_to (to, to_load, NULL, NULL), m_from (from, from_load, from_cfn, from_cfn_data), - m_len (len), m_max_size (MOVE_MAX_PIECES + 1), + m_len (len), m_max_size (max_pieces + 1), m_push (push), m_qi_vector_mode (qi_vector_mode) { int toi = m_to.get_addr_inc (); @@ -1324,8 +1326,8 @@ class move_by_pieces_d : public op_by_pieces_d public: move_by_pieces_d (rtx to, rtx from, unsigned HOST_WIDE_INT len, unsigned int align) - : op_by_pieces_d (to, false, from, true, NULL, NULL, len, align, - PUSHG_P (to)) + : op_by_pieces_d (MOVE_MAX_PIECES, to, false, from, true, NULL, + NULL, len, align, PUSHG_P (to)) { } rtx finish_retmode (memop_ret); @@ -1421,8 +1423,8 @@ class store_by_pieces_d : public op_by_pieces_d store_by_pieces_d 
(rtx to, by_pieces_constfn cfn, void *cfn_data, unsigned HOST_WIDE_INT len, unsigned int align, bool qi_vector_mode) - : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len, - align, false, qi_vector_mode) + : op_by_pieces_d (STORE_MAX_PIECES, to, false, NULL_RTX, true, cfn, + cfn_data, len, align, false, qi_vector_mode) { } rtx finish_retmode (memop_ret); @@ -1618,8 +1620,8 @@ class compare_by_pieces_d : public op_by_pieces_d compare_by_pieces_d (rtx op0, rtx op1, by_pieces_constfn op1_cfn, void *op1_cfn_data, HOST_WIDE_INT len, int align, rtx_code_label *fail_label) - : op_by_pieces_d (op0, true, op1, true, op1_cfn, op1_cfn_data, len, - align, false) + : op_by_pieces_d (COMPARE_MAX_PIECES, op0, true, op1, true, op1_cfn, + op1_cfn_data, len, align, false) { m_fail_label = fail_label; } ^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH v2] x86: Update STORE_MAX_PIECES 2021-08-03 13:56 [PATCH] by_pieces: Properly set m_max_size in op_by_pieces H.J. Lu 2021-08-03 21:22 ` H.J. Lu @ 2021-08-04 13:33 ` H.J. Lu 2021-08-04 18:46 ` Uros Bizjak 1 sibling, 1 reply; 7+ messages in thread From: H.J. Lu @ 2021-08-04 13:33 UTC (permalink / raw) To: GCC Patches [-- Attachment #1: Type: text/plain, Size: 2666 bytes --] On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit > move is enabled since x86 uses vec_duplicate, which is enabled only when > inter-unit move is enabled, to implement store_by_pieces. > 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for > compare_by_pieces. > > gcc/ > > PR target/101742 > * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES > for compare_by_pieces. > * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode > only if TARGET_INTER_UNIT_MOVES_TO_VEC is true. > > gcc/testsuite/ > > PR target/101742 > * gcc.target/i386/pr101742a.c: New test. > * gcc.target/i386/pr101742b.c: Likewise. > --- > gcc/config/i386/i386.h | 20 +++++++++++--------- > gcc/expr.c | 6 +++++- > gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++ > 4 files changed, 36 insertions(+), 10 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > index bed9cd9da18..9b416abd5f4 100644 > --- a/gcc/config/i386/i386.h > +++ b/gcc/config/i386/i386.h > @@ -1783,15 +1783,17 @@ typedef struct ix86_args { > /* STORE_MAX_PIECES is the number of bytes at a time that we can > store efficiently. 
*/ > #define STORE_MAX_PIECES \ > - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ > - ? 64 \ > - : ((TARGET_AVX \ > - && !TARGET_PREFER_AVX128 \ > - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ > - ? 32 \ > - : ((TARGET_SSE2 \ > - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ > - ? 16 : UNITS_PER_WORD))) > + (TARGET_INTER_UNIT_MOVES_TO_VEC \ > + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ > + ? 64 \ > + : ((TARGET_AVX \ > + && !TARGET_PREFER_AVX128 \ > + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ > + ? 32 \ > + : ((TARGET_SSE2 \ > + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ > + ? 16 : UNITS_PER_WORD))) \ > + : UNITS_PER_WORD) > > /* If a memory-to-memory move would take MOVE_RATIO or more simple > move-instruction pairs, we will do a cpymem or libcall instead. expr.c has been fixed. Here is the v2 patch for x86 backend. OK for master? Thanks. -- H.J. [-- Attachment #2: v2-0001-x86-Update-STORE_MAX_PIECES.patch --] [-- Type: application/x-patch, Size: 2930 bytes --] ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v2] x86: Update STORE_MAX_PIECES 2021-08-04 13:33 ` [PATCH v2] x86: Update STORE_MAX_PIECES H.J. Lu @ 2021-08-04 18:46 ` Uros Bizjak 2021-08-04 20:01 ` [PATCH v3] " H.J. Lu 0 siblings, 1 reply; 7+ messages in thread From: Uros Bizjak @ 2021-08-04 18:46 UTC (permalink / raw) To: H.J. Lu; +Cc: GCC Patches, Hongtao Liu On Wed, Aug 4, 2021 at 3:34 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit > > move is enabled since x86 uses vec_duplicate, which is enabled only when > > inter-unit move is enabled, to implement store_by_pieces. > > 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to > > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for > > compare_by_pieces. > > > > gcc/ > > > > PR target/101742 > > * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to > > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES > > for compare_by_pieces. > > * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode > > only if TARGET_INTER_UNIT_MOVES_TO_VEC is true. > > > > gcc/testsuite/ > > > > PR target/101742 > > * gcc.target/i386/pr101742a.c: New test. > > * gcc.target/i386/pr101742b.c: Likewise. 
> > --- > > gcc/config/i386/i386.h | 20 +++++++++++--------- > > gcc/expr.c | 6 +++++- > > gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++ > > gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++ > > 4 files changed, 36 insertions(+), 10 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c > > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > > index bed9cd9da18..9b416abd5f4 100644 > > --- a/gcc/config/i386/i386.h > > +++ b/gcc/config/i386/i386.h > > @@ -1783,15 +1783,17 @@ typedef struct ix86_args { > > /* STORE_MAX_PIECES is the number of bytes at a time that we can > > store efficiently. */ > > #define STORE_MAX_PIECES \ > > - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ > > - ? 64 \ > > - : ((TARGET_AVX \ > > - && !TARGET_PREFER_AVX128 \ > > - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ > > - ? 32 \ > > - : ((TARGET_SSE2 \ > > - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ > > - ? 16 : UNITS_PER_WORD))) > > + (TARGET_INTER_UNIT_MOVES_TO_VEC \ > > + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ > > + ? 64 \ > > + : ((TARGET_AVX \ > > + && !TARGET_PREFER_AVX128 \ > > + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ > > + ? 32 \ > > + : ((TARGET_SSE2 \ > > + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ > > + ? 16 : UNITS_PER_WORD))) \ > > + : UNITS_PER_WORD) > > > > /* If a memory-to-memory move would take MOVE_RATIO or more simple > > move-instruction pairs, we will do a cpymem or libcall instead. > > expr.c has been fixed. Here is the v2 patch for x86 backend. > OK for master? OK, but please add the comment about vec_duplicate before the define to explain the situation with TARGET_INTER_UNIT_MOVES_TO_VEC. Thanks, Uros. ^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH v3] x86: Update STORE_MAX_PIECES 2021-08-04 18:46 ` Uros Bizjak @ 2021-08-04 20:01 ` H.J. Lu 0 siblings, 0 replies; 7+ messages in thread From: H.J. Lu @ 2021-08-04 20:01 UTC (permalink / raw) To: Uros Bizjak; +Cc: GCC Patches, Hongtao Liu [-- Attachment #1: Type: text/plain, Size: 3526 bytes --] On Wed, Aug 4, 2021 at 11:46 AM Uros Bizjak <ubizjak@gmail.com> wrote: > > On Wed, Aug 4, 2021 at 3:34 PM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu <hjl.tools@gmail.com> wrote: > > > > > > 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit > > > move is enabled since x86 uses vec_duplicate, which is enabled only when > > > inter-unit move is enabled, to implement store_by_pieces. > > > 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to > > > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for > > > compare_by_pieces. > > > > > > gcc/ > > > > > > PR target/101742 > > > * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to > > > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES > > > for compare_by_pieces. > > > * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode > > > only if TARGET_INTER_UNIT_MOVES_TO_VEC is true. > > > > > > gcc/testsuite/ > > > > > > PR target/101742 > > > * gcc.target/i386/pr101742a.c: New test. > > > * gcc.target/i386/pr101742b.c: Likewise. 
> > > --- > > > gcc/config/i386/i386.h | 20 +++++++++++--------- > > > gcc/expr.c | 6 +++++- > > > gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++ > > > gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++ > > > 4 files changed, 36 insertions(+), 10 deletions(-) > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c > > > > > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > > > index bed9cd9da18..9b416abd5f4 100644 > > > --- a/gcc/config/i386/i386.h > > > +++ b/gcc/config/i386/i386.h > > > @@ -1783,15 +1783,17 @@ typedef struct ix86_args { > > > /* STORE_MAX_PIECES is the number of bytes at a time that we can > > > store efficiently. */ > > > #define STORE_MAX_PIECES \ > > > - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ > > > - ? 64 \ > > > - : ((TARGET_AVX \ > > > - && !TARGET_PREFER_AVX128 \ > > > - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ > > > - ? 32 \ > > > - : ((TARGET_SSE2 \ > > > - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ > > > - ? 16 : UNITS_PER_WORD))) > > > + (TARGET_INTER_UNIT_MOVES_TO_VEC \ > > > + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ > > > + ? 64 \ > > > + : ((TARGET_AVX \ > > > + && !TARGET_PREFER_AVX128 \ > > > + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ > > > + ? 32 \ > > > + : ((TARGET_SSE2 \ > > > + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ > > > + ? 16 : UNITS_PER_WORD))) \ > > > + : UNITS_PER_WORD) > > > > > > /* If a memory-to-memory move would take MOVE_RATIO or more simple > > > move-instruction pairs, we will do a cpymem or libcall instead. > > > > expr.c has been fixed. Here is the v2 patch for x86 backend. > > OK for master? > > OK, but please add the comment about vec_duplicate before the define > to explain the situation with TARGET_INTER_UNIT_MOVES_TO_VEC. This is what I am checking in with /* STORE_MAX_PIECES is the number of bytes at a time that we can store efficiently. 
Allow 16/32/64 bytes only if inter-unit move is enabled since vec_duplicate enabled by inter-unit move is used to implement store_by_pieces of 16/32/64 bytes. */ > Thanks, > Uros. Thanks. -- H.J. [-- Attachment #2: v3-0001-x86-Update-STORE_MAX_PIECES.patch --] [-- Type: text/x-patch, Size: 3284 bytes --] From 9487c165afb5b6083a3fc09a2e8b7bcabfe28765 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" <hjl.tools@gmail.com> Date: Tue, 3 Aug 2021 06:17:22 -0700 Subject: [PATCH v3] x86: Update STORE_MAX_PIECES Update STORE_MAX_PIECES to allow 16/32/64 bytes only if inter-unit move is enabled since vec_duplicate enabled by inter-unit move is used to implement store_by_pieces of 16/32/64 bytes. gcc/ PR target/101742 * config/i386/i386.h (STORE_MAX_PIECES): Allow 16/32/64 bytes only if TARGET_INTER_UNIT_MOVES_TO_VEC is true. gcc/testsuite/ PR target/101742 * gcc.target/i386/pr101742a.c: New test. * gcc.target/i386/pr101742b.c: Likewise. --- gcc/config/i386/i386.h | 26 +++++++++++++---------- gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++ gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++ 3 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index bed9cd9da18..21fe51bba40 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -1780,18 +1780,22 @@ typedef struct ix86_args { && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ ? 16 : UNITS_PER_WORD))) -/* STORE_MAX_PIECES is the number of bytes at a time that we can - store efficiently. */ +/* STORE_MAX_PIECES is the number of bytes at a time that we can store + efficiently. Allow 16/32/64 bytes only if inter-unit move is enabled + since vec_duplicate enabled by inter-unit move is used to implement + store_by_pieces of 16/32/64 bytes. */ #define STORE_MAX_PIECES \ - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ - ? 
64 \ - : ((TARGET_AVX \ - && !TARGET_PREFER_AVX128 \ - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ - ? 32 \ - : ((TARGET_SSE2 \ - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ - ? 16 : UNITS_PER_WORD))) + (TARGET_INTER_UNIT_MOVES_TO_VEC \ + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ + ? 64 \ + : ((TARGET_AVX \ + && !TARGET_PREFER_AVX128 \ + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ + ? 32 \ + : ((TARGET_SSE2 \ + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ + ? 16 : UNITS_PER_WORD))) \ + : UNITS_PER_WORD) /* If a memory-to-memory move would take MOVE_RATIO or more simple move-instruction pairs, we will do a cpymem or libcall instead. diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c b/gcc/testsuite/gcc.target/i386/pr101742a.c new file mode 100644 index 00000000000..67ea40587dd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101742a.c @@ -0,0 +1,16 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -mtune=nano-x2" } */ + +int n2; + +__attribute__ ((simd)) char +w7 (void) +{ + short int xb = n2; + int qp; + + for (qp = 0; qp < 2; ++qp) + xb = xb < 1; + + return xb; +} diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c b/gcc/testsuite/gcc.target/i386/pr101742b.c new file mode 100644 index 00000000000..ba19064077b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr101742b.c @@ -0,0 +1,4 @@ +/* { dg-do compile } */ +/* { dg-additional-options "-O3 -mtune=nano-x2 -mtune-ctrl=sse_unaligned_store_optimal" } */ + +#include "pr101742a.c" -- 2.31.1 ^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2021-08-04 20:02 UTC | newest] Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2021-08-03 13:56 [PATCH] by_pieces: Properly set m_max_size in op_by_pieces H.J. Lu 2021-08-03 21:22 ` H.J. Lu 2021-08-04 7:27 ` Richard Sandiford 2021-08-04 12:52 ` [PATCH v2] by_pieces: Pass MAX_PIECES to op_by_pieces_d H.J. Lu 2021-08-04 13:33 ` [PATCH v2] x86: Update STORE_MAX_PIECES H.J. Lu 2021-08-04 18:46 ` Uros Bizjak 2021-08-04 20:01 ` [PATCH v3] " H.J. Lu
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).