On Tue, Aug 3, 2021 at 6:56 AM H.J. Lu wrote: > > 1. Update x86 STORE_MAX_PIECES to use OImode and XImode only if inter-unit > move is enabled since x86 uses vec_duplicate, which is enabled only when > inter-unit move is enabled, to implement store_by_pieces. > 2. Update op_by_pieces_d::op_by_pieces_d to set m_max_size to > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES for > compare_by_pieces. > > gcc/ > > PR target/101742 > * expr.c (op_by_pieces_d::op_by_pieces_d): Set m_max_size to > STORE_MAX_PIECES for store_by_pieces and to COMPARE_MAX_PIECES > for compare_by_pieces. > * config/i386/i386.h (STORE_MAX_PIECES): Use OImode and XImode > only if TARGET_INTER_UNIT_MOVES_TO_VEC is true. > > gcc/testsuite/ > > PR target/101742 > * gcc.target/i386/pr101742a.c: New test. > * gcc.target/i386/pr101742b.c: Likewise. > --- > gcc/config/i386/i386.h | 20 +++++++++++--------- > gcc/expr.c | 6 +++++- > gcc/testsuite/gcc.target/i386/pr101742a.c | 16 ++++++++++++++++ > gcc/testsuite/gcc.target/i386/pr101742b.c | 4 ++++ > 4 files changed, 36 insertions(+), 10 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742a.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr101742b.c > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > index bed9cd9da18..9b416abd5f4 100644 > --- a/gcc/config/i386/i386.h > +++ b/gcc/config/i386/i386.h > @@ -1783,15 +1783,17 @@ typedef struct ix86_args { > /* STORE_MAX_PIECES is the number of bytes at a time that we can > store efficiently. */ > #define STORE_MAX_PIECES \ > - ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ > - ? 64 \ > - : ((TARGET_AVX \ > - && !TARGET_PREFER_AVX128 \ > - && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ > - ? 32 \ > - : ((TARGET_SSE2 \ > - && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ > - ? 16 : UNITS_PER_WORD))) > + (TARGET_INTER_UNIT_MOVES_TO_VEC \ > + ? ((TARGET_AVX512F && !TARGET_PREFER_AVX256) \ > + ? 64 \ > + : ((TARGET_AVX \ > + && !TARGET_PREFER_AVX128 \ > + && !TARGET_AVX256_SPLIT_UNALIGNED_STORE) \ > + ? 32 \ > + : ((TARGET_SSE2 \ > + && TARGET_SSE_UNALIGNED_STORE_OPTIMAL) \ > + ? 16 : UNITS_PER_WORD))) \ > + : UNITS_PER_WORD) > > /* If a memory-to-memory move would take MOVE_RATIO or more simple > move-instruction pairs, we will do a cpymem or libcall instead. > diff --git a/gcc/expr.c b/gcc/expr.c > index b65cfcfdcd1..2964b38b9a5 100644 > --- a/gcc/expr.c > +++ b/gcc/expr.c > @@ -1131,7 +1131,11 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load, > bool qi_vector_mode) > : m_to (to, to_load, NULL, NULL), > m_from (from, from_load, from_cfn, from_cfn_data), > - m_len (len), m_max_size (MOVE_MAX_PIECES + 1), > + m_len (len), > + m_max_size (((!to_load && from == nullptr) > + ? STORE_MAX_PIECES > + : (from_cfn != nullptr > + ? COMPARE_MAX_PIECES : MOVE_MAX_PIECES)) + 1), > m_push (push), m_qi_vector_mode (qi_vector_mode) > { > int toi = m_to.get_addr_inc (); This larger expr.c patch passes the proper MAX_PIECES directly. > diff --git a/gcc/testsuite/gcc.target/i386/pr101742a.c b/gcc/testsuite/gcc.target/i386/pr101742a.c > new file mode 100644 > index 00000000000..67ea40587dd > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr101742a.c > @@ -0,0 +1,16 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-O3 -mtune=nano-x2" } */ > + > +int n2; > + > +__attribute__ ((simd)) char > +w7 (void) > +{ > + short int xb = n2; > + int qp; > + > + for (qp = 0; qp < 2; ++qp) > + xb = xb < 1; > + > + return xb; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr101742b.c b/gcc/testsuite/gcc.target/i386/pr101742b.c > new file mode 100644 > index 00000000000..ba19064077b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr101742b.c > @@ -0,0 +1,4 @@ > +/* { dg-do compile } */ > +/* { dg-additional-options "-O3 -mtune=nano-x2 -mtune-ctrl=sse_unaligned_store_optimal" } */ > + > +#include "pr101742a.c" > -- > 2.31.1 > -- H.J.