* [PATCH] Rewrite memset expanders with vec_duplicate
@ 2021-07-13 21:49 H.J. Lu
2021-07-16 11:38 ` Richard Sandiford
0 siblings, 1 reply; 14+ messages in thread
From: H.J. Lu @ 2021-07-13 21:49 UTC (permalink / raw)
To: gcc-patches; +Cc: Richard Biener, Jeff Law, Richard Sandiford
1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
scratch register to avoid stack realignment when expanding memset.
PR middle-end/90773
* builtins.c (gen_memset_value_from_prev): New function.
(gen_memset_broadcast): Likewise.
(builtin_memset_read_str): Use gen_memset_value_from_prev
and gen_memset_broadcast.
(builtin_memset_gen_str): Likewise.
* target.def (gen_memset_scratch_rtx): New hook.
* doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
* doc/tm.texi: Regenerated.
---
gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++--------
gcc/doc/tm.texi | 5 ++
gcc/doc/tm.texi.in | 2 +
gcc/target.def | 7 +++
4 files changed, 116 insertions(+), 21 deletions(-)
diff --git a/gcc/builtins.c b/gcc/builtins.c
index 39ab139b7e1..c1758ae2efc 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
return NULL_RTX;
}
-/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
- bytes from constant string DATA + OFFSET and return it as target
- constant. If PREV isn't nullptr, it has the RTL info from the
+/* Return the RTL of a register in MODE generated from PREV in the
previous iteration. */
-rtx
-builtin_memset_read_str (void *data, void *prevp,
- HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
- scalar_int_mode mode)
+static rtx
+gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
{
+ rtx target = nullptr;
by_pieces_prev *prev = (by_pieces_prev *) prevp;
if (prev != nullptr && prev->data != nullptr)
{
/* Use the previous data in the same mode. */
if (prev->mode == mode)
return prev->data;
+
+ rtx prev_rtx = prev->data;
+ machine_mode prev_mode = prev->mode;
+ unsigned int word_size = GET_MODE_SIZE (word_mode);
+ if (word_size < GET_MODE_SIZE (prev->mode)
+ && word_size > GET_MODE_SIZE (mode))
+ {
+ /* First generate subreg of word mode if the previous mode is
+ wider than word mode and word mode is wider than MODE. */
+ prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
+ prev_mode, 0);
+ prev_mode = word_mode;
+ }
+ if (prev_rtx != nullptr)
+ target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
}
+ return target;
+}
+
+/* Return the RTL of a register in MODE broadcasted from DATA. */
+
+static rtx
+gen_memset_broadcast (rtx data, scalar_int_mode mode)
+{
+ /* Skip if regno_reg_rtx isn't initialized. */
+ if (!regno_reg_rtx)
+ return nullptr;
+
+ rtx target = nullptr;
+
+ unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
+ machine_mode vector_mode;
+ if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
+ gcc_unreachable ();
+
+ enum insn_code icode = optab_handler (vec_duplicate_optab,
+ vector_mode);
+ if (icode != CODE_FOR_nothing)
+ {
+ rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
+ if (CONST_INT_P (data))
+ {
+ /* Use the move expander with CONST_VECTOR. */
+ rtx const_vec = gen_const_vec_duplicate (vector_mode, data);
+ emit_move_insn (reg, const_vec);
+ }
+ else
+ {
+
+ class expand_operand ops[2];
+ create_output_operand (&ops[0], reg, vector_mode);
+ create_input_operand (&ops[1], data, QImode);
+ expand_insn (icode, 2, ops);
+ if (!rtx_equal_p (reg, ops[0].value))
+ emit_move_insn (reg, ops[0].value);
+ }
+ target = lowpart_subreg (mode, reg, vector_mode);
+ }
+
+ return target;
+}
+
+/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
+ bytes from constant string DATA + OFFSET and return it as target
+ constant. If PREV isn't nullptr, it has the RTL info from the
+ previous iteration. */
+rtx
+builtin_memset_read_str (void *data, void *prev,
+ HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
+ scalar_int_mode mode)
+{
+ rtx target;
const char *c = (const char *) data;
- char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
+ char *p;
+
+ /* Don't use the previous value if size is 1. */
+ if (GET_MODE_SIZE (mode) != 1)
+ {
+ target = gen_memset_value_from_prev (prev, mode);
+ if (target != nullptr)
+ return target;
+
+ p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
+ memset (p, *c, GET_MODE_SIZE (QImode));
+ rtx src = c_readstr (p, QImode);
+ target = gen_memset_broadcast (src, mode);
+ if (target != nullptr)
+ return target;
+ }
+
+ p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
memset (p, *c, GET_MODE_SIZE (mode));
@@ -6719,7 +6804,7 @@ builtin_memset_read_str (void *data, void *prevp,
nullptr, it has the RTL info from the previous iteration. */
static rtx
-builtin_memset_gen_str (void *data, void *prevp,
+builtin_memset_gen_str (void *data, void *prev,
HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
scalar_int_mode mode)
{
@@ -6727,22 +6812,18 @@ builtin_memset_gen_str (void *data, void *prevp,
size_t size;
char *p;
- by_pieces_prev *prev = (by_pieces_prev *) prevp;
- if (prev != nullptr && prev->data != nullptr)
- {
- /* Use the previous data in the same mode. */
- if (prev->mode == mode)
- return prev->data;
-
- target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
- if (target != nullptr)
- return target;
- }
-
size = GET_MODE_SIZE (mode);
if (size == 1)
return (rtx) data;
+ target = gen_memset_value_from_prev (prev, mode);
+ if (target != nullptr)
+ return target;
+
+ target = gen_memset_broadcast ((rtx) data, mode);
+ if (target != nullptr)
+ return target;
+
p = XALLOCAVEC (char, size);
memset (p, 1, size);
coeff = c_readstr (p, mode);
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 2a41ae5fba1..8849711fcf8 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -12122,6 +12122,11 @@ This function prepares to emit a conditional comparison within a sequence
@var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
@end deftypefn
+@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode})
+This hook should return an rtx for scratch register in @var{mode} to
+be used by memset broadcast. The default is @code{gen_reg_rtx}.
+@end deftypefn
+
@deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})
This target hook returns a new value for the number of times @var{loop}
should be unrolled. The parameter @var{nunroll} is the number of times
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index f881cdabe9e..a6bbf4f2667 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7958,6 +7958,8 @@ lists.
@hook TARGET_GEN_CCMP_NEXT
+@hook TARGET_GEN_MEMSET_SCRATCH_RTX
+
@hook TARGET_LOOP_UNROLL_ADJUST
@defmac POWI_MAX_MULTS
diff --git a/gcc/target.def b/gcc/target.def
index c009671c583..5fb287db3bd 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2726,6 +2726,13 @@ DEFHOOK
rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),
NULL)
+DEFHOOK
+(gen_memset_scratch_rtx,
+ "This hook should return an rtx for scratch register in @var{mode} to\n\
+be used by memset broadcast. The default is @code{gen_reg_rtx}.",
+ rtx, (machine_mode mode),
+ gen_reg_rtx)
+
/* Return a new value for loop unroll size. */
DEFHOOK
(loop_unroll_adjust,
--
2.31.1
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Rewrite memset expanders with vec_duplicate
2021-07-13 21:49 [PATCH] Rewrite memset expanders with vec_duplicate H.J. Lu
@ 2021-07-16 11:38 ` Richard Sandiford
2021-07-16 12:57 ` H.J. Lu
0 siblings, 1 reply; 14+ messages in thread
From: Richard Sandiford @ 2021-07-16 11:38 UTC (permalink / raw)
To: H.J. Lu via Gcc-patches; +Cc: H.J. Lu, Jeff Law
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
> vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
> 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
> scratch register to avoid stack realignment when expanding memset.
>
> PR middle-end/90773
> * builtins.c (gen_memset_value_from_prev): New function.
> (gen_memset_broadcast): Likewise.
> (builtin_memset_read_str): Use gen_memset_value_from_prev
> and gen_memset_broadcast.
> (builtin_memset_gen_str): Likewise.
> * target.def (gen_memset_scratch_rtx): New hook.
> * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
> * doc/tm.texi: Regenerated.
> ---
> gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++--------
> gcc/doc/tm.texi | 5 ++
> gcc/doc/tm.texi.in | 2 +
> gcc/target.def | 7 +++
> 4 files changed, 116 insertions(+), 21 deletions(-)
>
> diff --git a/gcc/builtins.c b/gcc/builtins.c
> index 39ab139b7e1..c1758ae2efc 100644
> --- a/gcc/builtins.c
> +++ b/gcc/builtins.c
> @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
> return NULL_RTX;
> }
>
> -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> - bytes from constant string DATA + OFFSET and return it as target
> - constant. If PREV isn't nullptr, it has the RTL info from the
> +/* Return the RTL of a register in MODE generated from PREV in the
> previous iteration. */
>
> -rtx
> -builtin_memset_read_str (void *data, void *prevp,
> - HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> - scalar_int_mode mode)
> +static rtx
> +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
> {
> + rtx target = nullptr;
> by_pieces_prev *prev = (by_pieces_prev *) prevp;
> if (prev != nullptr && prev->data != nullptr)
> {
> /* Use the previous data in the same mode. */
> if (prev->mode == mode)
> return prev->data;
> +
> + rtx prev_rtx = prev->data;
> + machine_mode prev_mode = prev->mode;
> + unsigned int word_size = GET_MODE_SIZE (word_mode);
> + if (word_size < GET_MODE_SIZE (prev->mode)
> + && word_size > GET_MODE_SIZE (mode))
> + {
> + /* First generate subreg of word mode if the previous mode is
> + wider than word mode and word mode is wider than MODE. */
> + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> + prev_mode, 0);
> + prev_mode = word_mode;
> + }
> + if (prev_rtx != nullptr)
> + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
> }
> + return target;
> +}
> +
> +/* Return the RTL of a register in MODE broadcasted from DATA. */
> +
> +static rtx
> +gen_memset_broadcast (rtx data, scalar_int_mode mode)
> +{
> + /* Skip if regno_reg_rtx isn't initialized. */
> + if (!regno_reg_rtx)
> + return nullptr;
> +
> + rtx target = nullptr;
> +
> + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
> + machine_mode vector_mode;
> + if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
> + gcc_unreachable ();
Sorry, I realise it's a bit late to be raising this objection now,
but I don't think it's a good idea to use scalar integer modes as
a proxy for vector modes. In principle there's no reason why a
target has to define an integer mode for every vector mode.
If we want the mode to be a vector then I think the by-pieces
infrastructure should be extended to support vectors directly,
rather than assuming that each piece can be represented as
a scalar_int_mode.
Thanks,
Richard
> +
> + enum insn_code icode = optab_handler (vec_duplicate_optab,
> + vector_mode);
> + if (icode != CODE_FOR_nothing)
> + {
> + rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
> + if (CONST_INT_P (data))
> + {
> + /* Use the move expander with CONST_VECTOR. */
> + rtx const_vec = gen_const_vec_duplicate (vector_mode, data);
> + emit_move_insn (reg, const_vec);
> + }
> + else
> + {
> +
> + class expand_operand ops[2];
> + create_output_operand (&ops[0], reg, vector_mode);
> + create_input_operand (&ops[1], data, QImode);
> + expand_insn (icode, 2, ops);
> + if (!rtx_equal_p (reg, ops[0].value))
> + emit_move_insn (reg, ops[0].value);
> + }
> + target = lowpart_subreg (mode, reg, vector_mode);
> + }
> +
> + return target;
> +}
> +
> +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> + bytes from constant string DATA + OFFSET and return it as target
> + constant. If PREV isn't nullptr, it has the RTL info from the
> + previous iteration. */
>
> +rtx
> +builtin_memset_read_str (void *data, void *prev,
> + HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> + scalar_int_mode mode)
> +{
> + rtx target;
> const char *c = (const char *) data;
> - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> + char *p;
> +
> + /* Don't use the previous value if size is 1. */
> + if (GET_MODE_SIZE (mode) != 1)
> + {
> + target = gen_memset_value_from_prev (prev, mode);
> + if (target != nullptr)
> + return target;
> +
> + p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
> + memset (p, *c, GET_MODE_SIZE (QImode));
> + rtx src = c_readstr (p, QImode);
> + target = gen_memset_broadcast (src, mode);
> + if (target != nullptr)
> + return target;
> + }
> +
> + p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
>
> memset (p, *c, GET_MODE_SIZE (mode));
>
> @@ -6719,7 +6804,7 @@ builtin_memset_read_str (void *data, void *prevp,
> nullptr, it has the RTL info from the previous iteration. */
>
> static rtx
> -builtin_memset_gen_str (void *data, void *prevp,
> +builtin_memset_gen_str (void *data, void *prev,
> HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> scalar_int_mode mode)
> {
> @@ -6727,22 +6812,18 @@ builtin_memset_gen_str (void *data, void *prevp,
> size_t size;
> char *p;
>
> - by_pieces_prev *prev = (by_pieces_prev *) prevp;
> - if (prev != nullptr && prev->data != nullptr)
> - {
> - /* Use the previous data in the same mode. */
> - if (prev->mode == mode)
> - return prev->data;
> -
> - target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
> - if (target != nullptr)
> - return target;
> - }
> -
> size = GET_MODE_SIZE (mode);
> if (size == 1)
> return (rtx) data;
>
> + target = gen_memset_value_from_prev (prev, mode);
> + if (target != nullptr)
> + return target;
> +
> + target = gen_memset_broadcast ((rtx) data, mode);
> + if (target != nullptr)
> + return target;
> +
> p = XALLOCAVEC (char, size);
> memset (p, 1, size);
> coeff = c_readstr (p, mode);
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 2a41ae5fba1..8849711fcf8 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -12122,6 +12122,11 @@ This function prepares to emit a conditional comparison within a sequence
> @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
> @end deftypefn
>
> +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode})
> +This hook should return an rtx for scratch register in @var{mode} to
> +be used by memset broadcast. The default is @code{gen_reg_rtx}.
> +@end deftypefn
> +
> @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})
> This target hook returns a new value for the number of times @var{loop}
> should be unrolled. The parameter @var{nunroll} is the number of times
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index f881cdabe9e..a6bbf4f2667 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -7958,6 +7958,8 @@ lists.
>
> @hook TARGET_GEN_CCMP_NEXT
>
> +@hook TARGET_GEN_MEMSET_SCRATCH_RTX
> +
> @hook TARGET_LOOP_UNROLL_ADJUST
>
> @defmac POWI_MAX_MULTS
> diff --git a/gcc/target.def b/gcc/target.def
> index c009671c583..5fb287db3bd 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -2726,6 +2726,13 @@ DEFHOOK
> rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),
> NULL)
>
> +DEFHOOK
> +(gen_memset_scratch_rtx,
> + "This hook should return an rtx for scratch register in @var{mode} to\n\
> +be used by memset broadcast. The default is @code{gen_reg_rtx}.",
> + rtx, (machine_mode mode),
> + gen_reg_rtx)
> +
> /* Return a new value for loop unroll size. */
> DEFHOOK
> (loop_unroll_adjust,
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Rewrite memset expanders with vec_duplicate
2021-07-16 11:38 ` Richard Sandiford
@ 2021-07-16 12:57 ` H.J. Lu
2021-07-16 13:23 ` Richard Sandiford
0 siblings, 1 reply; 14+ messages in thread
From: H.J. Lu @ 2021-07-16 12:57 UTC (permalink / raw)
To: H.J. Lu via Gcc-patches, Jeff Law, Richard Sandiford
On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
> > scratch register to avoid stack realignment when expanding memset.
> >
> > PR middle-end/90773
> > * builtins.c (gen_memset_value_from_prev): New function.
> > (gen_memset_broadcast): Likewise.
> > (builtin_memset_read_str): Use gen_memset_value_from_prev
> > and gen_memset_broadcast.
> > (builtin_memset_gen_str): Likewise.
> > * target.def (gen_memset_scratch_rtx): New hook.
> > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
> > * doc/tm.texi: Regenerated.
> > ---
> > gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++--------
> > gcc/doc/tm.texi | 5 ++
> > gcc/doc/tm.texi.in | 2 +
> > gcc/target.def | 7 +++
> > 4 files changed, 116 insertions(+), 21 deletions(-)
> >
> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> > index 39ab139b7e1..c1758ae2efc 100644
> > --- a/gcc/builtins.c
> > +++ b/gcc/builtins.c
> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
> > return NULL_RTX;
> > }
> >
> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> > - bytes from constant string DATA + OFFSET and return it as target
> > - constant. If PREV isn't nullptr, it has the RTL info from the
> > +/* Return the RTL of a register in MODE generated from PREV in the
> > previous iteration. */
> >
> > -rtx
> > -builtin_memset_read_str (void *data, void *prevp,
> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > - scalar_int_mode mode)
> > +static rtx
> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
> > {
> > + rtx target = nullptr;
> > by_pieces_prev *prev = (by_pieces_prev *) prevp;
> > if (prev != nullptr && prev->data != nullptr)
> > {
> > /* Use the previous data in the same mode. */
> > if (prev->mode == mode)
> > return prev->data;
> > +
> > + rtx prev_rtx = prev->data;
> > + machine_mode prev_mode = prev->mode;
> > + unsigned int word_size = GET_MODE_SIZE (word_mode);
> > + if (word_size < GET_MODE_SIZE (prev->mode)
> > + && word_size > GET_MODE_SIZE (mode))
> > + {
> > + /* First generate subreg of word mode if the previous mode is
> > + wider than word mode and word mode is wider than MODE. */
> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> > + prev_mode, 0);
> > + prev_mode = word_mode;
> > + }
> > + if (prev_rtx != nullptr)
> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
> > }
> > + return target;
> > +}
> > +
> > +/* Return the RTL of a register in MODE broadcasted from DATA. */
> > +
> > +static rtx
> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)
> > +{
> > + /* Skip if regno_reg_rtx isn't initialized. */
> > + if (!regno_reg_rtx)
> > + return nullptr;
> > +
> > + rtx target = nullptr;
> > +
> > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
> > + machine_mode vector_mode;
> > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
> > + gcc_unreachable ();
>
> Sorry, I realise it's a bit late to be raising this objection now,
> but I don't think it's a good idea to use scalar integer modes as
> a proxy for vector modes. In principle there's no reason why a
> target has to define an integer mode for every vector mode.
A target always defines the largest integer mode.
> If we want the mode to be a vector then I think the by-pieces
> infrastructure should be extended to support vectors directly,
> rather than assuming that each piece can be represented as
> a scalar_int_mode.
>
The current by-pieces infrastructure operates on scalar_int_mode.
Only for memset, there is
/* Callback routine for store_by_pieces. Return the RTL of a register
containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned
char value given in the RTL register data. For example, if mode is
4 bytes wide, return the RTL for 0x01010101*data. If PREV isn't
nullptr, it has the RTL info from the previous iteration. */
static rtx
builtin_memset_gen_str (void *data, void *prevp,
HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
scalar_int_mode mode)
It is a broadcast. If a target can broadcast a byte to a wider integer,
can you suggest a way to use it in the current by-pieces infrastructure?
Thanks.
> Thanks,
> Richard
>
> > +
> > + enum insn_code icode = optab_handler (vec_duplicate_optab,
> > + vector_mode);
> > + if (icode != CODE_FOR_nothing)
> > + {
> > + rtx reg = targetm.gen_memset_scratch_rtx (vector_mode);
> > + if (CONST_INT_P (data))
> > + {
> > + /* Use the move expander with CONST_VECTOR. */
> > + rtx const_vec = gen_const_vec_duplicate (vector_mode, data);
> > + emit_move_insn (reg, const_vec);
> > + }
> > + else
> > + {
> > +
> > + class expand_operand ops[2];
> > + create_output_operand (&ops[0], reg, vector_mode);
> > + create_input_operand (&ops[1], data, QImode);
> > + expand_insn (icode, 2, ops);
> > + if (!rtx_equal_p (reg, ops[0].value))
> > + emit_move_insn (reg, ops[0].value);
> > + }
> > + target = lowpart_subreg (mode, reg, vector_mode);
> > + }
> > +
> > + return target;
> > +}
> > +
> > +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> > + bytes from constant string DATA + OFFSET and return it as target
> > + constant. If PREV isn't nullptr, it has the RTL info from the
> > + previous iteration. */
> >
> > +rtx
> > +builtin_memset_read_str (void *data, void *prev,
> > + HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > + scalar_int_mode mode)
> > +{
> > + rtx target;
> > const char *c = (const char *) data;
> > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> > + char *p;
> > +
> > + /* Don't use the previous value if size is 1. */
> > + if (GET_MODE_SIZE (mode) != 1)
> > + {
> > + target = gen_memset_value_from_prev (prev, mode);
> > + if (target != nullptr)
> > + return target;
> > +
> > + p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
> > + memset (p, *c, GET_MODE_SIZE (QImode));
> > + rtx src = c_readstr (p, QImode);
> > + target = gen_memset_broadcast (src, mode);
> > + if (target != nullptr)
> > + return target;
> > + }
> > +
> > + p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> >
> > memset (p, *c, GET_MODE_SIZE (mode));
> >
> > @@ -6719,7 +6804,7 @@ builtin_memset_read_str (void *data, void *prevp,
> > nullptr, it has the RTL info from the previous iteration. */
> >
> > static rtx
> > -builtin_memset_gen_str (void *data, void *prevp,
> > +builtin_memset_gen_str (void *data, void *prev,
> > HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > scalar_int_mode mode)
> > {
> > @@ -6727,22 +6812,18 @@ builtin_memset_gen_str (void *data, void *prevp,
> > size_t size;
> > char *p;
> >
> > - by_pieces_prev *prev = (by_pieces_prev *) prevp;
> > - if (prev != nullptr && prev->data != nullptr)
> > - {
> > - /* Use the previous data in the same mode. */
> > - if (prev->mode == mode)
> > - return prev->data;
> > -
> > - target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
> > - if (target != nullptr)
> > - return target;
> > - }
> > -
> > size = GET_MODE_SIZE (mode);
> > if (size == 1)
> > return (rtx) data;
> >
> > + target = gen_memset_value_from_prev (prev, mode);
> > + if (target != nullptr)
> > + return target;
> > +
> > + target = gen_memset_broadcast ((rtx) data, mode);
> > + if (target != nullptr)
> > + return target;
> > +
> > p = XALLOCAVEC (char, size);
> > memset (p, 1, size);
> > coeff = c_readstr (p, mode);
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> > index 2a41ae5fba1..8849711fcf8 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -12122,6 +12122,11 @@ This function prepares to emit a conditional comparison within a sequence
> > @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
> > @end deftypefn
> >
> > +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode})
> > +This hook should return an rtx for scratch register in @var{mode} to
> > +be used by memset broadcast. The default is @code{gen_reg_rtx}.
> > +@end deftypefn
> > +
> > @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})
> > This target hook returns a new value for the number of times @var{loop}
> > should be unrolled. The parameter @var{nunroll} is the number of times
> > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> > index f881cdabe9e..a6bbf4f2667 100644
> > --- a/gcc/doc/tm.texi.in
> > +++ b/gcc/doc/tm.texi.in
> > @@ -7958,6 +7958,8 @@ lists.
> >
> > @hook TARGET_GEN_CCMP_NEXT
> >
> > +@hook TARGET_GEN_MEMSET_SCRATCH_RTX
> > +
> > @hook TARGET_LOOP_UNROLL_ADJUST
> >
> > @defmac POWI_MAX_MULTS
> > diff --git a/gcc/target.def b/gcc/target.def
> > index c009671c583..5fb287db3bd 100644
> > --- a/gcc/target.def
> > +++ b/gcc/target.def
> > @@ -2726,6 +2726,13 @@ DEFHOOK
> > rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),
> > NULL)
> >
> > +DEFHOOK
> > +(gen_memset_scratch_rtx,
> > + "This hook should return an rtx for scratch register in @var{mode} to\n\
> > +be used by memset broadcast. The default is @code{gen_reg_rtx}.",
> > + rtx, (machine_mode mode),
> > + gen_reg_rtx)
> > +
> > /* Return a new value for loop unroll size. */
> > DEFHOOK
> > (loop_unroll_adjust,
--
H.J.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Rewrite memset expanders with vec_duplicate
2021-07-16 12:57 ` H.J. Lu
@ 2021-07-16 13:23 ` Richard Sandiford
2021-07-16 14:15 ` H.J. Lu
0 siblings, 1 reply; 14+ messages in thread
From: Richard Sandiford @ 2021-07-16 13:23 UTC (permalink / raw)
To: H.J. Lu via Gcc-patches; +Cc: Jeff Law, H.J. Lu
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
>> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
>> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
>> > scratch register to avoid stack realignment when expanding memset.
>> >
>> > PR middle-end/90773
>> > * builtins.c (gen_memset_value_from_prev): New function.
>> > (gen_memset_broadcast): Likewise.
>> > (builtin_memset_read_str): Use gen_memset_value_from_prev
>> > and gen_memset_broadcast.
>> > (builtin_memset_gen_str): Likewise.
>> > * target.def (gen_memset_scratch_rtx): New hook.
>> > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
>> > * doc/tm.texi: Regenerated.
>> > ---
>> > gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++--------
>> > gcc/doc/tm.texi | 5 ++
>> > gcc/doc/tm.texi.in | 2 +
>> > gcc/target.def | 7 +++
>> > 4 files changed, 116 insertions(+), 21 deletions(-)
>> >
>> > diff --git a/gcc/builtins.c b/gcc/builtins.c
>> > index 39ab139b7e1..c1758ae2efc 100644
>> > --- a/gcc/builtins.c
>> > +++ b/gcc/builtins.c
>> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
>> > return NULL_RTX;
>> > }
>> >
>> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
>> > - bytes from constant string DATA + OFFSET and return it as target
>> > - constant. If PREV isn't nullptr, it has the RTL info from the
>> > +/* Return the RTL of a register in MODE generated from PREV in the
>> > previous iteration. */
>> >
>> > -rtx
>> > -builtin_memset_read_str (void *data, void *prevp,
>> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
>> > - scalar_int_mode mode)
>> > +static rtx
>> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
>> > {
>> > + rtx target = nullptr;
>> > by_pieces_prev *prev = (by_pieces_prev *) prevp;
>> > if (prev != nullptr && prev->data != nullptr)
>> > {
>> > /* Use the previous data in the same mode. */
>> > if (prev->mode == mode)
>> > return prev->data;
>> > +
>> > + rtx prev_rtx = prev->data;
>> > + machine_mode prev_mode = prev->mode;
>> > + unsigned int word_size = GET_MODE_SIZE (word_mode);
>> > + if (word_size < GET_MODE_SIZE (prev->mode)
>> > + && word_size > GET_MODE_SIZE (mode))
>> > + {
>> > + /* First generate subreg of word mode if the previous mode is
>> > + wider than word mode and word mode is wider than MODE. */
>> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
>> > + prev_mode, 0);
>> > + prev_mode = word_mode;
>> > + }
>> > + if (prev_rtx != nullptr)
>> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
>> > }
>> > + return target;
>> > +}
>> > +
>> > +/* Return the RTL of a register in MODE broadcasted from DATA. */
>> > +
>> > +static rtx
>> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)
>> > +{
>> > + /* Skip if regno_reg_rtx isn't initialized. */
>> > + if (!regno_reg_rtx)
>> > + return nullptr;
>> > +
>> > + rtx target = nullptr;
>> > +
>> > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
>> > + machine_mode vector_mode;
>> > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
>> > + gcc_unreachable ();
>>
>> Sorry, I realise it's a bit late to be raising this objection now,
>> but I don't think it's a good idea to use scalar integer modes as
>> a proxy for vector modes. In principle there's no reason why a
>> target has to define an integer mode for every vector mode.
>
> A target always defines the largest integer mode.
Right. But a target shouldn't *need* to define an integer mode
for every vector mode.
>> If we want the mode to be a vector then I think the by-pieces
>> infrastructure should be extended to support vectors directly,
>> rather than assuming that each piece can be represented as
>> a scalar_int_mode.
>>
>
> The current by-pieces infrastructure operates on scalar_int_mode.
> Only for memset, there is
>
> /* Callback routine for store_by_pieces. Return the RTL of a register
> containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned
> char value given in the RTL register data. For example, if mode is
> 4 bytes wide, return the RTL for 0x01010101*data. If PREV isn't
> nullptr, it has the RTL info from the previous iteration. */
>
> static rtx
> builtin_memset_gen_str (void *data, void *prevp,
> HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> scalar_int_mode mode)
>
> It is a broadcast. If a target can broadcast a byte to a wider integer,
> can you suggest a way to use it in the current by-pieces infrastructure?
I meant that we should extend the by-pieces infrastructure so that
it no longer requires scalar_int_mode, rather than work within the
current limits.
IMO the fact that we're (legitimately) going through vec_duplicate_optab
shows that we really are dealing with vectors here, not integers.
Thanks,
Richard
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Rewrite memset expanders with vec_duplicate
2021-07-16 13:23 ` Richard Sandiford
@ 2021-07-16 14:15 ` H.J. Lu
2021-07-16 23:57 ` [PATCH] Add QI vector mode support to by-pieces for memset H.J. Lu
0 siblings, 1 reply; 14+ messages in thread
From: H.J. Lu @ 2021-07-16 14:15 UTC (permalink / raw)
To: H.J. Lu via Gcc-patches, Jeff Law, H.J. Lu, Richard Sandiford
On Fri, Jul 16, 2021 at 6:24 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford
> > <richard.sandiford@arm.com> wrote:
> >>
> >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
> >> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
> >> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
> >> > scratch register to avoid stack realignment when expanding memset.
> >> >
> >> > PR middle-end/90773
> >> > * builtins.c (gen_memset_value_from_prev): New function.
> >> > (gen_memset_broadcast): Likewise.
> >> > (builtin_memset_read_str): Use gen_memset_value_from_prev
> >> > and gen_memset_broadcast.
> >> > (builtin_memset_gen_str): Likewise.
> >> > * target.def (gen_memset_scratch_rtx): New hook.
> >> > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
> >> > * doc/tm.texi: Regenerated.
> >> > ---
> >> > gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++--------
> >> > gcc/doc/tm.texi | 5 ++
> >> > gcc/doc/tm.texi.in | 2 +
> >> > gcc/target.def | 7 +++
> >> > 4 files changed, 116 insertions(+), 21 deletions(-)
> >> >
> >> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> >> > index 39ab139b7e1..c1758ae2efc 100644
> >> > --- a/gcc/builtins.c
> >> > +++ b/gcc/builtins.c
> >> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
> >> > return NULL_RTX;
> >> > }
> >> >
> >> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> >> > - bytes from constant string DATA + OFFSET and return it as target
> >> > - constant. If PREV isn't nullptr, it has the RTL info from the
> >> > +/* Return the RTL of a register in MODE generated from PREV in the
> >> > previous iteration. */
> >> >
> >> > -rtx
> >> > -builtin_memset_read_str (void *data, void *prevp,
> >> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> >> > - scalar_int_mode mode)
> >> > +static rtx
> >> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
> >> > {
> >> > + rtx target = nullptr;
> >> > by_pieces_prev *prev = (by_pieces_prev *) prevp;
> >> > if (prev != nullptr && prev->data != nullptr)
> >> > {
> >> > /* Use the previous data in the same mode. */
> >> > if (prev->mode == mode)
> >> > return prev->data;
> >> > +
> >> > + rtx prev_rtx = prev->data;
> >> > + machine_mode prev_mode = prev->mode;
> >> > + unsigned int word_size = GET_MODE_SIZE (word_mode);
> >> > + if (word_size < GET_MODE_SIZE (prev->mode)
> >> > + && word_size > GET_MODE_SIZE (mode))
> >> > + {
> >> > + /* First generate subreg of word mode if the previous mode is
> >> > + wider than word mode and word mode is wider than MODE. */
> >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> >> > + prev_mode, 0);
> >> > + prev_mode = word_mode;
> >> > + }
> >> > + if (prev_rtx != nullptr)
> >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
> >> > }
> >> > + return target;
> >> > +}
> >> > +
> >> > +/* Return the RTL of a register in MODE broadcasted from DATA. */
> >> > +
> >> > +static rtx
> >> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)
> >> > +{
> >> > + /* Skip if regno_reg_rtx isn't initialized. */
> >> > + if (!regno_reg_rtx)
> >> > + return nullptr;
> >> > +
> >> > + rtx target = nullptr;
> >> > +
> >> > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
> >> > + machine_mode vector_mode;
> >> > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
> >> > + gcc_unreachable ();
> >>
> >> Sorry, I realise it's a bit late to be raising this objection now,
> >> but I don't think it's a good idea to use scalar integer modes as
> >> a proxy for vector modes. In principle there's no reason why a
> >> target has to define an integer mode for every vector mode.
> >
> > A target always defines the largest integer mode.
>
> Right. But a target shouldn't *need* to define an integer mode
> for every vector mode.
>
> >> If we want the mode to be a vector then I think the by-pieces
> >> infrastructure should be extended to support vectors directly,
> >> rather than assuming that each piece can be represented as
> >> a scalar_int_mode.
> >>
> >
> > The current by-pieces infrastructure operates on scalar_int_mode.
> > Only for memset, there is
> >
> > /* Callback routine for store_by_pieces. Return the RTL of a register
> > containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned
> > char value given in the RTL register data. For example, if mode is
> > 4 bytes wide, return the RTL for 0x01010101*data. If PREV isn't
> > nullptr, it has the RTL info from the previous iteration. */
> >
> > static rtx
> > builtin_memset_gen_str (void *data, void *prevp,
> > HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > scalar_int_mode mode)
> >
> > It is a broadcast. If a target can broadcast a byte to a wider integer,
> > can you suggest a way to use it in the current by-pieces infrastructure?
>
> I meant that we should extend the by-pieces infrastructure so that
> it no longer requires scalar_int_mode, rather than work within the
> current limits.
>
> IMO the fact that we're (legitimately) going through vec_duplicate_optab
If vec_duplicate_optab is the only blocking issue, can we add an integer
broadcast optab? I want to avoid a major surgery just because we
don't support integer broadcast from byte.
> shows that we really are dealing with vectors here, not integers.
>
Are you suggesting we should add the optional vector mode
support to by-pieces for memset?
--
H.J.
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH] Add QI vector mode support to by-pieces for memset
2021-07-16 14:15 ` H.J. Lu
@ 2021-07-16 23:57 ` H.J. Lu
2021-07-19 14:40 ` Richard Sandiford
0 siblings, 1 reply; 14+ messages in thread
From: H.J. Lu @ 2021-07-16 23:57 UTC (permalink / raw)
To: H.J. Lu via Gcc-patches, Jeff Law, Richard Sandiford
[-- Attachment #1: Type: text/plain, Size: 6678 bytes --]
On Fri, Jul 16, 2021 at 7:15 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Jul 16, 2021 at 6:24 AM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
> >
> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > > On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford
> > > <richard.sandiford@arm.com> wrote:
> > >>
> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > >> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
> > >> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
> > >> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
> > >> > scratch register to avoid stack realignment when expanding memset.
> > >> >
> > >> > PR middle-end/90773
> > >> > * builtins.c (gen_memset_value_from_prev): New function.
> > >> > (gen_memset_broadcast): Likewise.
> > >> > (builtin_memset_read_str): Use gen_memset_value_from_prev
> > >> > and gen_memset_broadcast.
> > >> > (builtin_memset_gen_str): Likewise.
> > >> > * target.def (gen_memset_scratch_rtx): New hook.
> > >> > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
> > >> > * doc/tm.texi: Regenerated.
> > >> > ---
> > >> > gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++--------
> > >> > gcc/doc/tm.texi | 5 ++
> > >> > gcc/doc/tm.texi.in | 2 +
> > >> > gcc/target.def | 7 +++
> > >> > 4 files changed, 116 insertions(+), 21 deletions(-)
> > >> >
> > >> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> > >> > index 39ab139b7e1..c1758ae2efc 100644
> > >> > --- a/gcc/builtins.c
> > >> > +++ b/gcc/builtins.c
> > >> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
> > >> > return NULL_RTX;
> > >> > }
> > >> >
> > >> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> > >> > - bytes from constant string DATA + OFFSET and return it as target
> > >> > - constant. If PREV isn't nullptr, it has the RTL info from the
> > >> > +/* Return the RTL of a register in MODE generated from PREV in the
> > >> > previous iteration. */
> > >> >
> > >> > -rtx
> > >> > -builtin_memset_read_str (void *data, void *prevp,
> > >> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > >> > - scalar_int_mode mode)
> > >> > +static rtx
> > >> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
> > >> > {
> > >> > + rtx target = nullptr;
> > >> > by_pieces_prev *prev = (by_pieces_prev *) prevp;
> > >> > if (prev != nullptr && prev->data != nullptr)
> > >> > {
> > >> > /* Use the previous data in the same mode. */
> > >> > if (prev->mode == mode)
> > >> > return prev->data;
> > >> > +
> > >> > + rtx prev_rtx = prev->data;
> > >> > + machine_mode prev_mode = prev->mode;
> > >> > + unsigned int word_size = GET_MODE_SIZE (word_mode);
> > >> > + if (word_size < GET_MODE_SIZE (prev->mode)
> > >> > + && word_size > GET_MODE_SIZE (mode))
> > >> > + {
> > >> > + /* First generate subreg of word mode if the previous mode is
> > >> > + wider than word mode and word mode is wider than MODE. */
> > >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> > >> > + prev_mode, 0);
> > >> > + prev_mode = word_mode;
> > >> > + }
> > >> > + if (prev_rtx != nullptr)
> > >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
> > >> > }
> > >> > + return target;
> > >> > +}
> > >> > +
> > >> > +/* Return the RTL of a register in MODE broadcasted from DATA. */
> > >> > +
> > >> > +static rtx
> > >> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)
> > >> > +{
> > >> > + /* Skip if regno_reg_rtx isn't initialized. */
> > >> > + if (!regno_reg_rtx)
> > >> > + return nullptr;
> > >> > +
> > >> > + rtx target = nullptr;
> > >> > +
> > >> > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
> > >> > + machine_mode vector_mode;
> > >> > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
> > >> > + gcc_unreachable ();
> > >>
> > >> Sorry, I realise it's a bit late to be raising this objection now,
> > >> but I don't think it's a good idea to use scalar integer modes as
> > >> a proxy for vector modes. In principle there's no reason why a
> > >> target has to define an integer mode for every vector mode.
> > >
> > > A target always defines the largest integer mode.
> >
> > Right. But a target shouldn't *need* to define an integer mode
> > for every vector mode.
> >
> > >> If we want the mode to be a vector then I think the by-pieces
> > >> infrastructure should be extended to support vectors directly,
> > >> rather than assuming that each piece can be represented as
> > >> a scalar_int_mode.
> > >>
> > >
> > > The current by-pieces infrastructure operates on scalar_int_mode.
> > > Only for memset, there is
> > >
> > > /* Callback routine for store_by_pieces. Return the RTL of a register
> > > containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned
> > > char value given in the RTL register data. For example, if mode is
> > > 4 bytes wide, return the RTL for 0x01010101*data. If PREV isn't
> > > nullptr, it has the RTL info from the previous iteration. */
> > >
> > > static rtx
> > > builtin_memset_gen_str (void *data, void *prevp,
> > > HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > > scalar_int_mode mode)
> > >
> > > It is a broadcast. If a target can broadcast a byte to a wider integer,
> > > can you suggest a way to use it in the current by-pieces infrastructure?
> >
> > I meant that we should extend the by-pieces infrastructure so that
> > it no longer requires scalar_int_mode, rather than work within the
> > current limits.
> >
> > IMO the fact that we're (legitimately) going through vec_duplicate_optab
>
> If vec_duplicate_optab is the only blocking issue, can we add an integer
> broadcast optab? I want to avoid a major surgery just because we
> don't support integer broadcast from byte.
>
> > shows that we really are dealing with vectors here, not integers.
> >
>
> Are you suggesting we should add the optional vector mode
> support to by-pieces for memset?
>
Like this?
1. Replace scalar_int_mode with machine_mode in the by-pieces
infrastructure to allow non-integer mode.
2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size
to return QI vector mode for memset.
3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest
integer or QI vector mode.
Thanks.
--
H.J.
[-- Attachment #2: 0001-Add-QI-vector-mode-support-to-by-pieces-for-memset.patch --]
[-- Type: text/x-patch, Size: 26658 bytes --]
From 171f0a919300a8aac58dfe5fc9904eaa8fe0a05b Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sun, 6 Mar 2016 06:38:21 -0800
Subject: [PATCH] Add QI vector mode support to by-pieces for memset
1. Replace scalar_int_mode with machine_mode in the by-pieces
infrastructure to allow non-integer mode.
2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size
to return QI vector mode for memset.
3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest
integer or QI vector mode.
gcc/
PR middle-end/90773
* builtins.c (builtin_memcpy_read_str): Change the mode argument
from scalar_int_mode to machine_mode.
(builtin_strncpy_read_str): Likewise.
(gen_memset_value_from_prev): New function.
(gen_memset_broadcast): Likewise.
(builtin_memset_read_str): Change the mode argument from
scalar_int_mode to machine_mode. Use gen_memset_value_from_prev
and gen_memset_broadcast.
(builtin_memset_gen_str): Likewise.
(try_store_by_multiple_pieces): Use by_pieces_constfn to declare
constfun.
* builtins.h (builtin_strncpy_read_str): Updated.
(builtin_memset_read_str): Likewise.
* expr.c (widest_int_mode_for_size): Renamed to ...
(widest_int_or_QI_vector_mode_for_size): Add a bool argument to
indicate if QI vector mode can be used.
(by_pieces_ninsns): Call widest_int_or_QI_vector_mode_for_size
instead of widest_int_mode_for_size.
(pieces_addr::adjust): Change the mode argument from
scalar_int_mode to machine_mode.
(op_by_pieces_d): Add a bool member, m_qi_vector_mode, to
indicate that QI vector mode can be used.
(op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize
m_qi_vector_mode. Call widest_int_or_QI_vector_mode_for_size
instead of widest_int_mode_for_size.
(op_by_pieces_d::get_usable_mode): Change the mode argument from
scalar_int_mode to machine_mode. Call
widest_int_or_QI_vector_mode_for_size instead of
widest_int_mode_for_size.
(op_by_pieces_d::smallest_mode_for_size): New member function
to return the smallest integer or QI vector mode.
(op_by_pieces_d::run): Call widest_int_or_QI_vector_mode_for_size
instead of widest_int_mode_for_size. Call smallest_mode_for_size
instead of smallest_int_mode_for_size.
(store_by_pieces_d::store_by_pieces_d): Add a bool argument to
indicate that QI vector mode can be used and pass it to
op_by_pieces_d::op_by_pieces_d.
(can_store_by_pieces): Call widest_int_or_QI_vector_mode_for_size
instead of widest_int_mode_for_size.
(store_by_pieces::store_by_pieces): Pass memsetp to
store_by_pieces_d::store_by_pieces_d.
(clear_by_pieces_1): Change scalar_int_mode to machine_mode.
(string_cst_read_str): Change the mode argument from
scalar_int_mode to machine_mode.
* expr.h (by_pieces_constfn): Change scalar_int_mode to
machine_mode.
(by_pieces_prev): Likewise.
* target.def (gen_memset_scratch_rtx): New hook.
* doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
* doc/tm.texi: Regenerated.
gcc/testsuite/
* gcc.target/i386/pr100865-3.c: Expect vmovdqu8 instead of
vmovdqu.
* gcc.target/i386/pr100865-4b.c: Likewise.
---
gcc/builtins.c | 138 ++++++++++++++-----
gcc/builtins.h | 4 +-
gcc/doc/tm.texi | 5 +
gcc/doc/tm.texi.in | 2 +
gcc/expr.c | 145 ++++++++++++++------
gcc/expr.h | 4 +-
gcc/target.def | 7 +
gcc/testsuite/gcc.target/i386/pr100865-3.c | 2 +-
gcc/testsuite/gcc.target/i386/pr100865-4b.c | 2 +-
9 files changed, 229 insertions(+), 80 deletions(-)
diff --git a/gcc/builtins.c b/gcc/builtins.c
index 39ab139b7e1..876378b467c 100644
--- a/gcc/builtins.c
+++ b/gcc/builtins.c
@@ -3890,13 +3890,14 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode)
static rtx
builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
- scalar_int_mode mode)
+ machine_mode mode)
{
/* The REPresentation pointed to by DATA need not be a nul-terminated
string but the caller guarantees it's large enough for MODE. */
const char *rep = (const char *) data;
- return c_readstr (rep + offset, mode, /*nul_terminated=*/false);
+ return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),
+ /*nul_terminated=*/false);
}
/* LEN specify length of the block of memcpy/memset operation.
@@ -6478,14 +6479,14 @@ expand_builtin_stpncpy (tree exp, rtx)
rtx
builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
- scalar_int_mode mode)
+ machine_mode mode)
{
const char *str = (const char *) data;
if ((unsigned HOST_WIDE_INT) offset > strlen (str))
return const0_rtx;
- return c_readstr (str + offset, mode);
+ return c_readstr (str + offset, as_a <scalar_int_mode> (mode));
}
/* Helper to check the sizes of sequences and the destination of calls
@@ -6686,30 +6687,109 @@ expand_builtin_strncpy (tree exp, rtx target)
return NULL_RTX;
}
-/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
- bytes from constant string DATA + OFFSET and return it as target
- constant. If PREV isn't nullptr, it has the RTL info from the
+/* Return the RTL of a register in MODE generated from PREV in the
previous iteration. */
-rtx
-builtin_memset_read_str (void *data, void *prevp,
- HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
- scalar_int_mode mode)
+static rtx
+gen_memset_value_from_prev (void *prevp, machine_mode mode)
{
+ rtx target = nullptr;
by_pieces_prev *prev = (by_pieces_prev *) prevp;
if (prev != nullptr && prev->data != nullptr)
{
/* Use the previous data in the same mode. */
if (prev->mode == mode)
return prev->data;
+
+ rtx prev_rtx = prev->data;
+ machine_mode prev_mode = prev->mode;
+ unsigned int word_size = GET_MODE_SIZE (word_mode);
+ if (word_size < GET_MODE_SIZE (prev->mode).to_constant ()
+ && word_size > GET_MODE_SIZE (mode).to_constant ())
+ {
+ /* First generate subreg of word mode if the previous mode is
+ wider than word mode and word mode is wider than MODE. */
+ prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
+ prev_mode, 0);
+ prev_mode = word_mode;
+ }
+ if (prev_rtx != nullptr)
+ target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
+ }
+ return target;
+}
+
+/* Return the RTL of a register in MODE broadcasted from DATA. */
+
+static rtx
+gen_memset_broadcast (rtx data, machine_mode mode)
+{
+ /* Skip if regno_reg_rtx isn't initialized or not vector mode. */
+ if (!regno_reg_rtx || !VECTOR_MODE_P (mode))
+ return nullptr;
+
+ gcc_assert (GET_MODE_INNER (mode) == QImode);
+
+ enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
+
+ gcc_assert (icode != CODE_FOR_nothing);
+
+ rtx target = targetm.gen_memset_scratch_rtx (mode);
+ if (CONST_INT_P (data))
+ {
+ /* Use the move expander with CONST_VECTOR. */
+ rtx const_vec = gen_const_vec_duplicate (mode, data);
+ emit_move_insn (target, const_vec);
+ }
+ else
+ {
+
+ class expand_operand ops[2];
+ create_output_operand (&ops[0], target, mode);
+ create_input_operand (&ops[1], data, QImode);
+ expand_insn (icode, 2, ops);
+ if (!rtx_equal_p (target, ops[0].value))
+ emit_move_insn (target, ops[0].value);
}
+ return target;
+}
+
+/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
+ bytes from constant string DATA + OFFSET and return it as target
+ constant. If PREV isn't nullptr, it has the RTL info from the
+ previous iteration. */
+
+rtx
+builtin_memset_read_str (void *data, void *prev,
+ HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
+ machine_mode mode)
+{
+ rtx target;
const char *c = (const char *) data;
- char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
+ char *p;
+ unsigned int size = GET_MODE_SIZE (mode).to_constant ();
+
+ /* Don't use the previous value if size is 1. */
+ if (size != 1)
+ {
+ target = gen_memset_value_from_prev (prev, mode);
+ if (target != nullptr)
+ return target;
+
+ p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
+ memset (p, *c, GET_MODE_SIZE (QImode));
+ rtx src = c_readstr (p, QImode);
+ target = gen_memset_broadcast (src, mode);
+ if (target != nullptr)
+ return target;
+ }
+
+ p = XALLOCAVEC (char, size);
- memset (p, *c, GET_MODE_SIZE (mode));
+ memset (p, *c, size);
- return c_readstr (p, mode);
+ return c_readstr (p, as_a <scalar_int_mode> (mode));
}
/* Callback routine for store_by_pieces. Return the RTL of a register
@@ -6719,33 +6799,29 @@ builtin_memset_read_str (void *data, void *prevp,
nullptr, it has the RTL info from the previous iteration. */
static rtx
-builtin_memset_gen_str (void *data, void *prevp,
+builtin_memset_gen_str (void *data, void *prev,
HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
- scalar_int_mode mode)
+ machine_mode mode)
{
rtx target, coeff;
size_t size;
char *p;
- by_pieces_prev *prev = (by_pieces_prev *) prevp;
- if (prev != nullptr && prev->data != nullptr)
- {
- /* Use the previous data in the same mode. */
- if (prev->mode == mode)
- return prev->data;
-
- target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
- if (target != nullptr)
- return target;
- }
-
- size = GET_MODE_SIZE (mode);
+ size = GET_MODE_SIZE (mode).to_constant ();
if (size == 1)
return (rtx) data;
+ target = gen_memset_value_from_prev (prev, mode);
+ if (target != nullptr)
+ return target;
+
+ target = gen_memset_broadcast ((rtx) data, mode);
+ if (target != nullptr)
+ return target;
+
p = XALLOCAVEC (char, size);
memset (p, 1, size);
- coeff = c_readstr (p, mode);
+ coeff = c_readstr (p, as_a <scalar_int_mode> (mode));
target = convert_to_mode (mode, (rtx) data, 1);
target = expand_mult (mode, target, coeff, NULL_RTX, 1);
@@ -6849,7 +6925,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
&valc, align, true))
return false;
- rtx (*constfun) (void *, void *, HOST_WIDE_INT, scalar_int_mode);
+ by_pieces_constfn constfun;
void *constfundata;
if (val)
{
diff --git a/gcc/builtins.h b/gcc/builtins.h
index a64ece3f1cd..31d07621750 100644
--- a/gcc/builtins.h
+++ b/gcc/builtins.h
@@ -111,9 +111,9 @@ extern tree mathfn_built_in (tree, enum built_in_function fn);
extern tree mathfn_built_in (tree, combined_fn);
extern tree mathfn_built_in_type (combined_fn);
extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT,
- scalar_int_mode);
+ machine_mode);
extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT,
- scalar_int_mode);
+ machine_mode);
extern rtx expand_builtin_saveregs (void);
extern tree std_build_builtin_va_list (void);
extern tree std_fn_abi_va_list (tree);
diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
index 3ad39443eba..c1995270720 100644
--- a/gcc/doc/tm.texi
+++ b/gcc/doc/tm.texi
@@ -12123,6 +12123,11 @@ This function prepares to emit a conditional comparison within a sequence
@var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
@end deftypefn
+@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode})
+This hook should return an rtx for scratch register in @var{mode} to
+be used by memset broadcast. The default is @code{gen_reg_rtx}.
+@end deftypefn
+
@deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})
This target hook returns a new value for the number of times @var{loop}
should be unrolled. The parameter @var{nunroll} is the number of times
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index f881cdabe9e..a6bbf4f2667 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -7958,6 +7958,8 @@ lists.
@hook TARGET_GEN_CCMP_NEXT
+@hook TARGET_GEN_MEMSET_SCRATCH_RTX
+
@hook TARGET_LOOP_UNROLL_ADJUST
@defmac POWI_MAX_MULTS
diff --git a/gcc/expr.c b/gcc/expr.c
index 6a4368113c4..92dbf47194e 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -769,15 +769,36 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align)
return align;
}
-/* Return the widest integer mode that is narrower than SIZE bytes. */
+/* Return the widest integer or QI vector mode that is narrower than
+ SIZE bytes. */
-static scalar_int_mode
-widest_int_mode_for_size (unsigned int size)
+static machine_mode
+widest_int_or_QI_vector_mode_for_size (unsigned int size,
+ bool qi_vector)
{
- scalar_int_mode result = NARROWEST_INT_MODE;
+ machine_mode result = NARROWEST_INT_MODE;
gcc_checking_assert (size > 1);
+ /* Use QI vector only if size is wider than a WORD. */
+ if (qi_vector && size > GET_MODE_SIZE (word_mode))
+ {
+ machine_mode mode;
+ FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
+ if (GET_MODE_INNER (mode) == QImode
+ && GET_MODE_SIZE (mode).is_constant ())
+ {
+ if (GET_MODE_SIZE (mode).to_constant () >= size)
+ break;
+ if (optab_handler (vec_duplicate_optab, mode)
+ != CODE_FOR_nothing)
+ result = mode;
+ }
+
+ if (result != NARROWEST_INT_MODE)
+ return result;
+ }
+
opt_scalar_int_mode tmode;
FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT)
if (GET_MODE_SIZE (tmode.require ()) < size)
@@ -815,16 +836,18 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,
unsigned int max_size, by_pieces_operation op)
{
unsigned HOST_WIDE_INT n_insns = 0;
- scalar_int_mode mode;
+ machine_mode mode;
if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES)
{
/* NB: Round up L and ALIGN to the widest integer mode for
MAX_SIZE. */
- mode = widest_int_mode_for_size (max_size);
+ mode = widest_int_or_QI_vector_mode_for_size (max_size,
+ op == SET_BY_PIECES);
if (optab_handler (mov_optab, mode) != CODE_FOR_nothing)
{
- unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode));
+ unsigned HOST_WIDE_INT up = ROUND_UP (l,
+ GET_MODE_SIZE (mode).to_constant ());
if (up > l)
l = up;
align = GET_MODE_ALIGNMENT (mode);
@@ -835,10 +858,11 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,
while (max_size > 1 && l > 0)
{
- mode = widest_int_mode_for_size (max_size);
+ mode = widest_int_or_QI_vector_mode_for_size (max_size,
+ op == SET_BY_PIECES);
enum insn_code icode;
- unsigned int modesize = GET_MODE_SIZE (mode);
+ unsigned int modesize = GET_MODE_SIZE (mode).to_constant ();
icode = optab_handler (mov_optab, mode);
if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode))
@@ -903,8 +927,7 @@ class pieces_addr
void *m_cfndata;
public:
pieces_addr (rtx, bool, by_pieces_constfn, void *);
- rtx adjust (scalar_int_mode, HOST_WIDE_INT,
- by_pieces_prev * = nullptr);
+ rtx adjust (machine_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr);
void increment_address (HOST_WIDE_INT);
void maybe_predec (HOST_WIDE_INT);
void maybe_postinc (HOST_WIDE_INT);
@@ -1006,7 +1029,7 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse,
but we still modify the MEM's properties. */
rtx
-pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset,
+pieces_addr::adjust (machine_mode mode, HOST_WIDE_INT offset,
by_pieces_prev *prev)
{
if (m_constfn)
@@ -1060,7 +1083,8 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size)
class op_by_pieces_d
{
private:
- scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int);
+ machine_mode get_usable_mode (machine_mode, unsigned int);
+ machine_mode smallest_mode_for_size (unsigned int);
protected:
pieces_addr m_to, m_from;
@@ -1073,6 +1097,8 @@ class op_by_pieces_d
bool m_push;
/* True if targetm.overlap_op_by_pieces_p () returns true. */
bool m_overlap_op_by_pieces;
+ /* True if QI vector mode can be used. */
+ bool m_qi_vector_mode;
/* Virtual functions, overriden by derived classes for the specific
operation. */
@@ -1084,7 +1110,8 @@ class op_by_pieces_d
public:
op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
- unsigned HOST_WIDE_INT, unsigned int, bool);
+ unsigned HOST_WIDE_INT, unsigned int, bool,
+ bool = false);
void run ();
};
@@ -1099,11 +1126,12 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
by_pieces_constfn from_cfn,
void *from_cfn_data,
unsigned HOST_WIDE_INT len,
- unsigned int align, bool push)
+ unsigned int align, bool push,
+ bool qi_vector_mode)
: m_to (to, to_load, NULL, NULL),
m_from (from, from_load, from_cfn, from_cfn_data),
m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
- m_push (push)
+ m_push (push), m_qi_vector_mode (qi_vector_mode)
{
int toi = m_to.get_addr_inc ();
int fromi = m_from.get_addr_inc ();
@@ -1124,7 +1152,9 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2)
{
/* Find the mode of the largest comparison. */
- scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
+ machine_mode mode
+ = widest_int_or_QI_vector_mode_for_size (m_max_size,
+ m_qi_vector_mode);
m_from.decide_autoinc (mode, m_reverse, len);
m_to.decide_autoinc (mode, m_reverse, len);
@@ -1139,22 +1169,42 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
/* This function returns the largest usable integer mode for LEN bytes
whose size is no bigger than size of MODE. */
-scalar_int_mode
-op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)
+machine_mode
+op_by_pieces_d::get_usable_mode (machine_mode mode, unsigned int len)
{
unsigned int size;
do
{
- size = GET_MODE_SIZE (mode);
+ size = GET_MODE_SIZE (mode).to_constant ();
if (len >= size && prepare_mode (mode, m_align))
break;
- /* NB: widest_int_mode_for_size checks SIZE > 1. */
- mode = widest_int_mode_for_size (size);
+ /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1. */
+ mode = widest_int_or_QI_vector_mode_for_size (size,
+ m_qi_vector_mode);
}
while (1);
return mode;
}
+/* Return the smallest integer or QI vector mode that is not narrower
+ than SIZE bits. */
+
+machine_mode
+op_by_pieces_d::smallest_mode_for_size (unsigned int size)
+{
+ /* Use QI vector only for > size of WORD. */
+ if (m_qi_vector_mode && size > GET_MODE_PRECISION (word_mode))
+ {
+ machine_mode mode;
+ FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
+ if (GET_MODE_INNER (mode) == QImode
+ && known_ge (GET_MODE_PRECISION (mode), size))
+ return mode;
+ }
+
+ return smallest_int_mode_for_size (size);
+}
+
/* This function contains the main loop used for expanding a block
operation. First move what we can in the largest integer mode,
then go to successively smaller modes. For every access, call
@@ -1166,8 +1216,10 @@ op_by_pieces_d::run ()
if (m_len == 0)
return;
- /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1. */
- scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
+ /* NB: widest_int_or_QI_vector_mode_for_size checks M_MAX_SIZE > 1. */
+ machine_mode mode
+ = widest_int_or_QI_vector_mode_for_size (m_max_size,
+ m_qi_vector_mode);
mode = get_usable_mode (mode, m_len);
by_pieces_prev to_prev = { nullptr, mode };
@@ -1175,7 +1227,7 @@ op_by_pieces_d::run ()
do
{
- unsigned int size = GET_MODE_SIZE (mode);
+ unsigned int size = GET_MODE_SIZE (mode).to_constant ();
rtx to1 = NULL_RTX, from1;
while (m_len >= size)
@@ -1214,9 +1266,10 @@ op_by_pieces_d::run ()
/* NB: Generate overlapping operations if it is not a stack
push since stack push must not overlap. Get the smallest
integer mode for M_LEN bytes. */
- mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT);
- mode = get_usable_mode (mode, GET_MODE_SIZE (mode));
- int gap = GET_MODE_SIZE (mode) - m_len;
+ mode = smallest_mode_for_size (m_len * BITS_PER_UNIT);
+ unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();
+ mode = get_usable_mode (mode, mode_size);
+ int gap = mode_size - m_len;
if (gap > 0)
{
/* If size of MODE > M_LEN, generate the last operation
@@ -1231,8 +1284,9 @@ op_by_pieces_d::run ()
}
else
{
- /* NB: widest_int_mode_for_size checks SIZE > 1. */
- mode = widest_int_mode_for_size (size);
+ /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1. */
+ mode = widest_int_or_QI_vector_mode_for_size (size,
+ m_qi_vector_mode);
mode = get_usable_mode (mode, m_len);
}
}
@@ -1355,9 +1409,10 @@ class store_by_pieces_d : public op_by_pieces_d
public:
store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
- unsigned HOST_WIDE_INT len, unsigned int align)
+ unsigned HOST_WIDE_INT len, unsigned int align,
+ bool qi_vector_mode = false)
: op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
- align, false)
+ align, false, qi_vector_mode)
{
}
rtx finish_retmode (memop_ret);
@@ -1446,13 +1501,14 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
max_size = STORE_MAX_PIECES + 1;
while (max_size > 1 && l > 0)
{
- scalar_int_mode mode = widest_int_mode_for_size (max_size);
+ machine_mode mode
+ = widest_int_or_QI_vector_mode_for_size (max_size, memsetp);
icode = optab_handler (mov_optab, mode);
if (icode != CODE_FOR_nothing
&& align >= GET_MODE_ALIGNMENT (mode))
{
- unsigned int size = GET_MODE_SIZE (mode);
+ unsigned int size = GET_MODE_SIZE (mode).to_constant ();
while (l >= size)
{
@@ -1470,7 +1526,7 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
}
}
- max_size = GET_MODE_SIZE (mode);
+ max_size = GET_MODE_SIZE (mode).to_constant ();
}
/* The code above should have handled everything. */
@@ -1504,7 +1560,8 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
memsetp ? SET_BY_PIECES : STORE_BY_PIECES,
optimize_insn_for_speed_p ()));
- store_by_pieces_d data (to, constfun, constfundata, len, align);
+ store_by_pieces_d data (to, constfun, constfundata, len, align,
+ memsetp);
data.run ();
if (retmode != RETURN_BEGIN)
@@ -1517,7 +1574,7 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
Return const0_rtx unconditionally. */
static rtx
-clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode)
+clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, machine_mode)
{
return const0_rtx;
}
@@ -5754,7 +5811,7 @@ emit_storent_insn (rtx to, rtx from)
static rtx
string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
- scalar_int_mode mode)
+ machine_mode mode)
{
tree str = (tree) data;
@@ -5762,17 +5819,19 @@ string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
if (offset >= TREE_STRING_LENGTH (str))
return const0_rtx;
- if ((unsigned HOST_WIDE_INT) offset + GET_MODE_SIZE (mode)
+ unsigned int size = GET_MODE_SIZE (mode).to_constant ();
+ if ((unsigned HOST_WIDE_INT) offset + size
> (unsigned HOST_WIDE_INT) TREE_STRING_LENGTH (str))
{
- char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
+ char *p = XALLOCAVEC (char, size);
size_t l = TREE_STRING_LENGTH (str) - offset;
memcpy (p, TREE_STRING_POINTER (str) + offset, l);
- memset (p + l, '\0', GET_MODE_SIZE (mode) - l);
- return c_readstr (p, mode, false);
+ memset (p + l, '\0', size - l);
+ return c_readstr (p, as_a <scalar_int_mode> (mode), false);
}
- return c_readstr (TREE_STRING_POINTER (str) + offset, mode, false);
+ return c_readstr (TREE_STRING_POINTER (str) + offset,
+ as_a <scalar_int_mode> (mode), false);
}
/* Generate code for computing expression EXP,
diff --git a/gcc/expr.h b/gcc/expr.h
index a4f44265759..552645e4403 100644
--- a/gcc/expr.h
+++ b/gcc/expr.h
@@ -108,13 +108,13 @@ enum block_op_methods
};
typedef rtx (*by_pieces_constfn) (void *, void *, HOST_WIDE_INT,
- scalar_int_mode);
+ machine_mode);
/* The second pointer passed to by_pieces_constfn. */
struct by_pieces_prev
{
rtx data;
- scalar_int_mode mode;
+ machine_mode mode;
};
extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
diff --git a/gcc/target.def b/gcc/target.def
index 2e40448e6c5..cc4aa3a4212 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -2726,6 +2726,13 @@ DEFHOOK
rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),
NULL)
+DEFHOOK
+(gen_memset_scratch_rtx,
+ "This hook should return an rtx for scratch register in @var{mode} to\n\
+be used by memset broadcast. The default is @code{gen_reg_rtx}.",
+ rtx, (machine_mode mode),
+ gen_reg_rtx)
+
/* Return a new value for loop unroll size. */
DEFHOOK
(loop_unroll_adjust,
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c
index b6dbcf7809b..007e79f91b0 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-3.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c
@@ -10,6 +10,6 @@ foo (void)
}
/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
/* { dg-final { scan-assembler-not "vmovdqa" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
index f41e6147b4c..1e50dc842bc 100644
--- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
+++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
@@ -4,6 +4,6 @@
#include "pr100865-4a.c"
/* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
-/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */
+/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */
/* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
/* { dg-final { scan-assembler-not "vmovdqa" } } */
--
2.31.1
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Add QI vector mode support to by-pieces for memset
2021-07-16 23:57 ` [PATCH] Add QI vector mode support to by-pieces for memset H.J. Lu
@ 2021-07-19 14:40 ` Richard Sandiford
2021-07-20 2:43 ` H.J. Lu
0 siblings, 1 reply; 14+ messages in thread
From: Richard Sandiford @ 2021-07-19 14:40 UTC (permalink / raw)
To: H.J. Lu via Gcc-patches; +Cc: Jeff Law, H.J. Lu
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> On Fri, Jul 16, 2021 at 7:15 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>>
>> On Fri, Jul 16, 2021 at 6:24 AM Richard Sandiford
>> <richard.sandiford@arm.com> wrote:
>> >
>> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> > > On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford
>> > > <richard.sandiford@arm.com> wrote:
>> > >>
>> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> > >> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
>> > >> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
>> > >> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
>> > >> > scratch register to avoid stack realignment when expanding memset.
>> > >> >
>> > >> > PR middle-end/90773
>> > >> > * builtins.c (gen_memset_value_from_prev): New function.
>> > >> > (gen_memset_broadcast): Likewise.
>> > >> > (builtin_memset_read_str): Use gen_memset_value_from_prev
>> > >> > and gen_memset_broadcast.
>> > >> > (builtin_memset_gen_str): Likewise.
>> > >> > * target.def (gen_memset_scratch_rtx): New hook.
>> > >> > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
>> > >> > * doc/tm.texi: Regenerated.
>> > >> > ---
>> > >> > gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++--------
>> > >> > gcc/doc/tm.texi | 5 ++
>> > >> > gcc/doc/tm.texi.in | 2 +
>> > >> > gcc/target.def | 7 +++
>> > >> > 4 files changed, 116 insertions(+), 21 deletions(-)
>> > >> >
>> > >> > diff --git a/gcc/builtins.c b/gcc/builtins.c
>> > >> > index 39ab139b7e1..c1758ae2efc 100644
>> > >> > --- a/gcc/builtins.c
>> > >> > +++ b/gcc/builtins.c
>> > >> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
>> > >> > return NULL_RTX;
>> > >> > }
>> > >> >
>> > >> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
>> > >> > - bytes from constant string DATA + OFFSET and return it as target
>> > >> > - constant. If PREV isn't nullptr, it has the RTL info from the
>> > >> > +/* Return the RTL of a register in MODE generated from PREV in the
>> > >> > previous iteration. */
>> > >> >
>> > >> > -rtx
>> > >> > -builtin_memset_read_str (void *data, void *prevp,
>> > >> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
>> > >> > - scalar_int_mode mode)
>> > >> > +static rtx
>> > >> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
>> > >> > {
>> > >> > + rtx target = nullptr;
>> > >> > by_pieces_prev *prev = (by_pieces_prev *) prevp;
>> > >> > if (prev != nullptr && prev->data != nullptr)
>> > >> > {
>> > >> > /* Use the previous data in the same mode. */
>> > >> > if (prev->mode == mode)
>> > >> > return prev->data;
>> > >> > +
>> > >> > + rtx prev_rtx = prev->data;
>> > >> > + machine_mode prev_mode = prev->mode;
>> > >> > + unsigned int word_size = GET_MODE_SIZE (word_mode);
>> > >> > + if (word_size < GET_MODE_SIZE (prev->mode)
>> > >> > + && word_size > GET_MODE_SIZE (mode))
>> > >> > + {
>> > >> > + /* First generate subreg of word mode if the previous mode is
>> > >> > + wider than word mode and word mode is wider than MODE. */
>> > >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
>> > >> > + prev_mode, 0);
>> > >> > + prev_mode = word_mode;
>> > >> > + }
>> > >> > + if (prev_rtx != nullptr)
>> > >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
>> > >> > }
>> > >> > + return target;
>> > >> > +}
>> > >> > +
>> > >> > +/* Return the RTL of a register in MODE broadcasted from DATA. */
>> > >> > +
>> > >> > +static rtx
>> > >> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)
>> > >> > +{
>> > >> > + /* Skip if regno_reg_rtx isn't initialized. */
>> > >> > + if (!regno_reg_rtx)
>> > >> > + return nullptr;
>> > >> > +
>> > >> > + rtx target = nullptr;
>> > >> > +
>> > >> > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
>> > >> > + machine_mode vector_mode;
>> > >> > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
>> > >> > + gcc_unreachable ();
>> > >>
>> > >> Sorry, I realise it's a bit late to be raising this objection now,
>> > >> but I don't think it's a good idea to use scalar integer modes as
>> > >> a proxy for vector modes. In principle there's no reason why a
>> > >> target has to define an integer mode for every vector mode.
>> > >
>> > > A target always defines the largest integer mode.
>> >
>> > Right. But a target shouldn't *need* to define an integer mode
>> > for every vector mode.
>> >
>> > >> If we want the mode to be a vector then I think the by-pieces
>> > >> infrastructure should be extended to support vectors directly,
>> > >> rather than assuming that each piece can be represented as
>> > >> a scalar_int_mode.
>> > >>
>> > >
>> > > The current by-pieces infrastructure operates on scalar_int_mode.
>> > > Only for memset, there is
>> > >
>> > > /* Callback routine for store_by_pieces. Return the RTL of a register
>> > > containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned
>> > > char value given in the RTL register data. For example, if mode is
>> > > 4 bytes wide, return the RTL for 0x01010101*data. If PREV isn't
>> > > nullptr, it has the RTL info from the previous iteration. */
>> > >
>> > > static rtx
>> > > builtin_memset_gen_str (void *data, void *prevp,
>> > > HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
>> > > scalar_int_mode mode)
>> > >
>> > > It is a broadcast. If a target can broadcast a byte to a wider integer,
>> > > can you suggest a way to use it in the current by-pieces infrastructure?
>> >
>> > I meant that we should extend the by-pieces infrastructure so that
>> > it no longer requires scalar_int_mode, rather than work within the
>> > current limits.
>> >
>> > IMO the fact that we're (legitimately) going through vec_duplicate_optab
>>
>> If vec_duplicate_optab is the only blocking issue, can we add an integer
>> broadcast optab? I want to avoid a major surgery just because we
>> don't support integer broadcast from byte.
>>
>> > shows that we really are dealing with vectors here, not integers.
>> >
>>
>> Are you suggesting we should add the optional vector mode
>> support to by-pieces for memset?
>>
>
> Like this?
>
> 1. Replace scalar_int_mode with machine_mode in the by-pieces
> infrastructure to allow non-integer mode.
I guess it might be better to use fixed_size_mode instead of
machine_mode.
> 2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size
> to return QI vector mode for memset.
> 3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest
> integer or QI vector mode.
>
> Thanks.
>
> --
> H.J.
>
> From 171f0a919300a8aac58dfe5fc9904eaa8fe0a05b Mon Sep 17 00:00:00 2001
> From: "H.J. Lu" <hjl.tools@gmail.com>
> Date: Sun, 6 Mar 2016 06:38:21 -0800
> Subject: [PATCH] Add QI vector mode support to by-pieces for memset
>
> 1. Replace scalar_int_mode with machine_mode in the by-pieces
> infrastructure to allow non-integer mode.
> 2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size
> to return QI vector mode for memset.
> 3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest
> integer or QI vector mode.
>
> gcc/
>
> PR middle-end/90773
> * builtins.c (builtin_memcpy_read_str): Change the mode argument
> from scalar_int_mode to machine_mode.
> (builtin_strncpy_read_str): Likewise.
> (gen_memset_value_from_prev): New function.
> (gen_memset_broadcast): Likewise.
> (builtin_memset_read_str): Change the mode argument from
> scalar_int_mode to machine_mode. Use gen_memset_value_from_prev
> and gen_memset_broadcast.
> (builtin_memset_gen_str): Likewise.
> (try_store_by_multiple_pieces): Use by_pieces_constfn to declare
> constfun.
> * builtins.h (builtin_strncpy_read_str): Updated.
> (builtin_memset_read_str): Likewise.
> * expr.c (widest_int_mode_for_size): Renamed to ...
> (widest_int_or_QI_vector_mode_for_size): Add a bool argument to
> indicate if QI vector mode can be used.
> (by_pieces_ninsns): Call widest_int_or_QI_vector_mode_for_size
> instead of widest_int_mode_for_size.
> (pieces_addr::adjust): Change the mode argument from
> scalar_int_mode to machine_mode.
> (op_by_pieces_d): Add a bool member, m_qi_vector_mode, to
> indicate that QI vector mode can be used.
> (op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize
> m_qi_vector_mode. Call widest_int_or_QI_vector_mode_for_size
> instead of widest_int_mode_for_size.
> (op_by_pieces_d::get_usable_mode): Change the mode argument from
> scalar_int_mode to machine_mode. Call
> widest_int_or_QI_vector_mode_for_size instead of
> widest_int_mode_for_size.
> (op_by_pieces_d::smallest_mode_for_size): New member function
> to return the smallest integer or QI vector mode.
> (op_by_pieces_d::run): Call widest_int_or_QI_vector_mode_for_size
> instead of widest_int_mode_for_size. Call smallest_mode_for_size
> instead of smallest_int_mode_for_size.
> (store_by_pieces_d::store_by_pieces_d): Add a bool argument to
> indicate that QI vector mode can be used and pass it to
> op_by_pieces_d::op_by_pieces_d.
> (can_store_by_pieces): Call widest_int_or_QI_vector_mode_for_size
> instead of widest_int_mode_for_size.
> (store_by_pieces::store_by_pieces): Pass memsetp to
> store_by_pieces_d::store_by_pieces_d.
> (clear_by_pieces_1): Change scalar_int_mode to machine_mode.
> (string_cst_read_str): Change the mode argument from
> scalar_int_mode to machine_mode.
> * expr.h (by_pieces_constfn): Change scalar_int_mode to
> machine_mode.
> (by_pieces_prev): Likewise.
> * target.def (gen_memset_scratch_rtx): New hook.
> * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
> * doc/tm.texi: Regenerated.
>
> gcc/testsuite/
>
> * gcc.target/i386/pr100865-3.c: Expect vmovdqu8 instead of
> vmovdqu.
> * gcc.target/i386/pr100865-4b.c: Likewise.
> ---
> gcc/builtins.c | 138 ++++++++++++++-----
> gcc/builtins.h | 4 +-
> gcc/doc/tm.texi | 5 +
> gcc/doc/tm.texi.in | 2 +
> gcc/expr.c | 145 ++++++++++++++------
> gcc/expr.h | 4 +-
> gcc/target.def | 7 +
> gcc/testsuite/gcc.target/i386/pr100865-3.c | 2 +-
> gcc/testsuite/gcc.target/i386/pr100865-4b.c | 2 +-
> 9 files changed, 229 insertions(+), 80 deletions(-)
>
> diff --git a/gcc/builtins.c b/gcc/builtins.c
> index 39ab139b7e1..876378b467c 100644
> --- a/gcc/builtins.c
> +++ b/gcc/builtins.c
> @@ -3890,13 +3890,14 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode)
>
> static rtx
> builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> - scalar_int_mode mode)
> + machine_mode mode)
> {
> /* The REPresentation pointed to by DATA need not be a nul-terminated
> string but the caller guarantees it's large enough for MODE. */
> const char *rep = (const char *) data;
>
> - return c_readstr (rep + offset, mode, /*nul_terminated=*/false);
> + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),
> + /*nul_terminated=*/false);
> }
>
> /* LEN specify length of the block of memcpy/memset operation.
> @@ -6478,14 +6479,14 @@ expand_builtin_stpncpy (tree exp, rtx)
>
> rtx
> builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> - scalar_int_mode mode)
> + machine_mode mode)
> {
> const char *str = (const char *) data;
>
> if ((unsigned HOST_WIDE_INT) offset > strlen (str))
> return const0_rtx;
>
> - return c_readstr (str + offset, mode);
> + return c_readstr (str + offset, as_a <scalar_int_mode> (mode));
> }
> /* Helper to check the sizes of sequences and the destination of calls
I think both of these as_a<scalar_int_mode>s should have a comment
explaining that we don't (yet?) consider using vector modes for this
operation.
> @@ -6686,30 +6687,109 @@ expand_builtin_strncpy (tree exp, rtx target)
> return NULL_RTX;
> }
>
> -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> - bytes from constant string DATA + OFFSET and return it as target
> - constant. If PREV isn't nullptr, it has the RTL info from the
> +/* Return the RTL of a register in MODE generated from PREV in the
> previous iteration. */
>
> -rtx
> -builtin_memset_read_str (void *data, void *prevp,
> - HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> - scalar_int_mode mode)
> +static rtx
> +gen_memset_value_from_prev (void *prevp, machine_mode mode)
> {
> + rtx target = nullptr;
> by_pieces_prev *prev = (by_pieces_prev *) prevp;
I think it'd be better to pass the proper type instead, since this
function isn't directly implementing a callback interface.
> if (prev != nullptr && prev->data != nullptr)
> {
> /* Use the previous data in the same mode. */
> if (prev->mode == mode)
> return prev->data;
> +
> + rtx prev_rtx = prev->data;
> + machine_mode prev_mode = prev->mode;
> + unsigned int word_size = GET_MODE_SIZE (word_mode);
> + if (word_size < GET_MODE_SIZE (prev->mode).to_constant ()
> + && word_size > GET_MODE_SIZE (mode).to_constant ())
Using fixed_size_mode would also avoid the to_constants here.
UNITS_PER_WORD is more direct/canonical than GET_MODE_SIZE (word_mode).
> + {
> + /* First generate subreg of word mode if the previous mode is
> + wider than word mode and word mode is wider than MODE. */
> + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> + prev_mode, 0);
> + prev_mode = word_mode;
> + }
> + if (prev_rtx != nullptr)
> + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
This should be lowpart_subreg, since 0 isn't the right offset for
big-endian targets. Using lowpart_subreg should also avoid the need
for the word_size “if” above: lowpart_subreg can handle lowpart subword
subregs of multiword values.
> + }
> + return target;
> +}
> +
> +/* Return the RTL of a register in MODE broadcasted from DATA. */
> +
> +static rtx
> +gen_memset_broadcast (rtx data, machine_mode mode)
> +{
> + /* Skip if regno_reg_rtx isn't initialized or not vector mode. */
Why's the regno_reg_rtx condition needed?
> + if (!regno_reg_rtx || !VECTOR_MODE_P (mode))
> + return nullptr;
> +
> + gcc_assert (GET_MODE_INNER (mode) == QImode);
> +
> + enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
> +
> + gcc_assert (icode != CODE_FOR_nothing);
Might be worth a comment here saying that having vec_duplicate_optab is
a precondition for picking a vector mode.
> +
> + rtx target = targetm.gen_memset_scratch_rtx (mode);
> + if (CONST_INT_P (data))
> + {
> + /* Use the move expander with CONST_VECTOR. */
> + rtx const_vec = gen_const_vec_duplicate (mode, data);
> + emit_move_insn (target, const_vec);
> + }
> + else
> + {
> +
Nit: excess blank line.
> + class expand_operand ops[2];
> + create_output_operand (&ops[0], target, mode);
> + create_input_operand (&ops[1], data, QImode);
> + expand_insn (icode, 2, ops);
> + if (!rtx_equal_p (target, ops[0].value))
> + emit_move_insn (target, ops[0].value);
> }
>
> + return target;
> +}
> +
> +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> + bytes from constant string DATA + OFFSET and return it as target
> + constant. If PREV isn't nullptr, it has the RTL info from the
> + previous iteration. */
> +
> +rtx
> +builtin_memset_read_str (void *data, void *prev,
> + HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> + machine_mode mode)
> +{
> + rtx target;
> const char *c = (const char *) data;
> - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> + char *p;
> + unsigned int size = GET_MODE_SIZE (mode).to_constant ();
> +
> + /* Don't use the previous value if size is 1. */
Why not though? The existing code does, and that seems like the right
thing to do when operating on integers.
I can see the check would be a good thing to do if MODE isn't a vector
mode and the previous mode was. Is that the case you're worried about?
If so, that check could go in gen_memset_value_from_prev instead.
> + if (size != 1)
> + {
> + target = gen_memset_value_from_prev (prev, mode);
> + if (target != nullptr)
> + return target;
> +
> + p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
> + memset (p, *c, GET_MODE_SIZE (QImode));
> + rtx src = c_readstr (p, QImode);
This seems unnecessarily complicated. Couldn't it just be
gen_int_mode (QImode, *c);
instead? There are no endianness concerns for single QI values.
> + target = gen_memset_broadcast (src, mode);
> + if (target != nullptr)
> + return target;
> + }
> +
> + p = XALLOCAVEC (char, size);
>
> - memset (p, *c, GET_MODE_SIZE (mode));
> + memset (p, *c, size);
>
> - return c_readstr (p, mode);
> + return c_readstr (p, as_a <scalar_int_mode> (mode));
> }
>
> /* Callback routine for store_by_pieces. Return the RTL of a register
> @@ -6719,33 +6799,29 @@ builtin_memset_read_str (void *data, void *prevp,
> nullptr, it has the RTL info from the previous iteration. */
>
> static rtx
> -builtin_memset_gen_str (void *data, void *prevp,
> +builtin_memset_gen_str (void *data, void *prev,
> HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> - scalar_int_mode mode)
> + machine_mode mode)
> {
> rtx target, coeff;
> size_t size;
> char *p;
>
> - by_pieces_prev *prev = (by_pieces_prev *) prevp;
> - if (prev != nullptr && prev->data != nullptr)
> - {
> - /* Use the previous data in the same mode. */
> - if (prev->mode == mode)
> - return prev->data;
> -
> - target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
> - if (target != nullptr)
> - return target;
> - }
> -
> - size = GET_MODE_SIZE (mode);
> + size = GET_MODE_SIZE (mode).to_constant ();
> if (size == 1)
> return (rtx) data;
>
> + target = gen_memset_value_from_prev (prev, mode);
> + if (target != nullptr)
> + return target;
> +
> + target = gen_memset_broadcast ((rtx) data, mode);
> + if (target != nullptr)
> + return target;
> +
> p = XALLOCAVEC (char, size);
> memset (p, 1, size);
> - coeff = c_readstr (p, mode);
> + coeff = c_readstr (p, as_a <scalar_int_mode> (mode));
>
> target = convert_to_mode (mode, (rtx) data, 1);
> target = expand_mult (mode, target, coeff, NULL_RTX, 1);
> @@ -6849,7 +6925,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
> &valc, align, true))
> return false;
>
> - rtx (*constfun) (void *, void *, HOST_WIDE_INT, scalar_int_mode);
> + by_pieces_constfn constfun;
> void *constfundata;
> if (val)
> {
> diff --git a/gcc/builtins.h b/gcc/builtins.h
> index a64ece3f1cd..31d07621750 100644
> --- a/gcc/builtins.h
> +++ b/gcc/builtins.h
> @@ -111,9 +111,9 @@ extern tree mathfn_built_in (tree, enum built_in_function fn);
> extern tree mathfn_built_in (tree, combined_fn);
> extern tree mathfn_built_in_type (combined_fn);
> extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT,
> - scalar_int_mode);
> + machine_mode);
> extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT,
> - scalar_int_mode);
> + machine_mode);
> extern rtx expand_builtin_saveregs (void);
> extern tree std_build_builtin_va_list (void);
> extern tree std_fn_abi_va_list (tree);
> diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> index 3ad39443eba..c1995270720 100644
> --- a/gcc/doc/tm.texi
> +++ b/gcc/doc/tm.texi
> @@ -12123,6 +12123,11 @@ This function prepares to emit a conditional comparison within a sequence
> @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
> @end deftypefn
>
> +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode})
> +This hook should return an rtx for scratch register in @var{mode} to
> +be used by memset broadcast. The default is @code{gen_reg_rtx}.
This should give a hint why the default might not be appropriate.
> +@end deftypefn
> +
> @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})
> This target hook returns a new value for the number of times @var{loop}
> should be unrolled. The parameter @var{nunroll} is the number of times
> diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> index f881cdabe9e..a6bbf4f2667 100644
> --- a/gcc/doc/tm.texi.in
> +++ b/gcc/doc/tm.texi.in
> @@ -7958,6 +7958,8 @@ lists.
>
> @hook TARGET_GEN_CCMP_NEXT
>
> +@hook TARGET_GEN_MEMSET_SCRATCH_RTX
> +
> @hook TARGET_LOOP_UNROLL_ADJUST
>
> @defmac POWI_MAX_MULTS
> diff --git a/gcc/expr.c b/gcc/expr.c
> index 6a4368113c4..92dbf47194e 100644
> --- a/gcc/expr.c
> +++ b/gcc/expr.c
> @@ -769,15 +769,36 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align)
> return align;
> }
>
> -/* Return the widest integer mode that is narrower than SIZE bytes. */
> +/* Return the widest integer or QI vector mode that is narrower than
> + SIZE bytes. */
>
> -static scalar_int_mode
> -widest_int_mode_for_size (unsigned int size)
> +static machine_mode
> +widest_int_or_QI_vector_mode_for_size (unsigned int size,
> + bool qi_vector)
> {
> - scalar_int_mode result = NARROWEST_INT_MODE;
> + machine_mode result = NARROWEST_INT_MODE;
>
> gcc_checking_assert (size > 1);
>
> + /* Use QI vector only if size is wider than a WORD. */
> + if (qi_vector && size > GET_MODE_SIZE (word_mode))
> + {
> + machine_mode mode;
> + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
> + if (GET_MODE_INNER (mode) == QImode
> + && GET_MODE_SIZE (mode).is_constant ())
> + {
> + if (GET_MODE_SIZE (mode).to_constant () >= size)
> + break;
> + if (optab_handler (vec_duplicate_optab, mode)
> + != CODE_FOR_nothing)
> + result = mode;
> + }
> +
> + if (result != NARROWEST_INT_MODE)
> + return result;
> + }
> +
> opt_scalar_int_mode tmode;
> FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT)
> if (GET_MODE_SIZE (tmode.require ()) < size)
> @@ -815,16 +836,18 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,
> unsigned int max_size, by_pieces_operation op)
> {
> unsigned HOST_WIDE_INT n_insns = 0;
> - scalar_int_mode mode;
> + machine_mode mode;
>
> if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES)
> {
> /* NB: Round up L and ALIGN to the widest integer mode for
> MAX_SIZE. */
> - mode = widest_int_mode_for_size (max_size);
> + mode = widest_int_or_QI_vector_mode_for_size (max_size,
> + op == SET_BY_PIECES);
> if (optab_handler (mov_optab, mode) != CODE_FOR_nothing)
> {
> - unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode));
> + unsigned HOST_WIDE_INT up = ROUND_UP (l,
> + GET_MODE_SIZE (mode).to_constant ());
> if (up > l)
> l = up;
> align = GET_MODE_ALIGNMENT (mode);
> @@ -835,10 +858,11 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,
>
> while (max_size > 1 && l > 0)
> {
> - mode = widest_int_mode_for_size (max_size);
> + mode = widest_int_or_QI_vector_mode_for_size (max_size,
> + op == SET_BY_PIECES);
> enum insn_code icode;
>
> - unsigned int modesize = GET_MODE_SIZE (mode);
> + unsigned int modesize = GET_MODE_SIZE (mode).to_constant ();
>
> icode = optab_handler (mov_optab, mode);
> if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode))
> @@ -903,8 +927,7 @@ class pieces_addr
> void *m_cfndata;
> public:
> pieces_addr (rtx, bool, by_pieces_constfn, void *);
> - rtx adjust (scalar_int_mode, HOST_WIDE_INT,
> - by_pieces_prev * = nullptr);
> + rtx adjust (machine_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr);
> void increment_address (HOST_WIDE_INT);
> void maybe_predec (HOST_WIDE_INT);
> void maybe_postinc (HOST_WIDE_INT);
> @@ -1006,7 +1029,7 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse,
> but we still modify the MEM's properties. */
>
> rtx
> -pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset,
> +pieces_addr::adjust (machine_mode mode, HOST_WIDE_INT offset,
> by_pieces_prev *prev)
> {
> if (m_constfn)
> @@ -1060,7 +1083,8 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size)
> class op_by_pieces_d
> {
> private:
> - scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int);
> + machine_mode get_usable_mode (machine_mode, unsigned int);
> + machine_mode smallest_mode_for_size (unsigned int);
>
> protected:
> pieces_addr m_to, m_from;
> @@ -1073,6 +1097,8 @@ class op_by_pieces_d
> bool m_push;
> /* True if targetm.overlap_op_by_pieces_p () returns true. */
> bool m_overlap_op_by_pieces;
> + /* True if QI vector mode can be used. */
> + bool m_qi_vector_mode;
>
> /* Virtual functions, overriden by derived classes for the specific
> operation. */
> @@ -1084,7 +1110,8 @@ class op_by_pieces_d
>
> public:
> op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
> - unsigned HOST_WIDE_INT, unsigned int, bool);
> + unsigned HOST_WIDE_INT, unsigned int, bool,
> + bool = false);
> void run ();
> };
>
> @@ -1099,11 +1126,12 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
> by_pieces_constfn from_cfn,
> void *from_cfn_data,
> unsigned HOST_WIDE_INT len,
> - unsigned int align, bool push)
> + unsigned int align, bool push,
> + bool qi_vector_mode)
> : m_to (to, to_load, NULL, NULL),
> m_from (from, from_load, from_cfn, from_cfn_data),
> m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
> - m_push (push)
> + m_push (push), m_qi_vector_mode (qi_vector_mode)
> {
> int toi = m_to.get_addr_inc ();
> int fromi = m_from.get_addr_inc ();
> @@ -1124,7 +1152,9 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
> if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2)
> {
> /* Find the mode of the largest comparison. */
> - scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
> + machine_mode mode
> + = widest_int_or_QI_vector_mode_for_size (m_max_size,
> + m_qi_vector_mode);
>
> m_from.decide_autoinc (mode, m_reverse, len);
> m_to.decide_autoinc (mode, m_reverse, len);
> @@ -1139,22 +1169,42 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
> /* This function returns the largest usable integer mode for LEN bytes
> whose size is no bigger than size of MODE. */
>
> -scalar_int_mode
> -op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)
> +machine_mode
> +op_by_pieces_d::get_usable_mode (machine_mode mode, unsigned int len)
> {
> unsigned int size;
> do
> {
> - size = GET_MODE_SIZE (mode);
> + size = GET_MODE_SIZE (mode).to_constant ();
> if (len >= size && prepare_mode (mode, m_align))
> break;
> - /* NB: widest_int_mode_for_size checks SIZE > 1. */
> - mode = widest_int_mode_for_size (size);
> + /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1. */
> + mode = widest_int_or_QI_vector_mode_for_size (size,
> + m_qi_vector_mode);
> }
> while (1);
> return mode;
> }
>
> +/* Return the smallest integer or QI vector mode that is not narrower
> + than SIZE bits. */
> +
> +machine_mode
> +op_by_pieces_d::smallest_mode_for_size (unsigned int size)
> +{
> + /* Use QI vector only for > size of WORD. */
> + if (m_qi_vector_mode && size > GET_MODE_PRECISION (word_mode))
> + {
> + machine_mode mode;
> + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
> + if (GET_MODE_INNER (mode) == QImode
> + && known_ge (GET_MODE_PRECISION (mode), size))
> + return mode;
Shouldn't this also test for vec_duplicate_optab? I guess you're
hoping that the previous register will be reused via a lowpart,
but since that's up to the callbacks, I think we should be
conservative here.
Alternatively, we could bypass the callbacks when this code is used.
Thanks,
Richard
> + }
> +
> + return smallest_int_mode_for_size (size);
> +}
> +
> /* This function contains the main loop used for expanding a block
> operation. First move what we can in the largest integer mode,
> then go to successively smaller modes. For every access, call
> @@ -1166,8 +1216,10 @@ op_by_pieces_d::run ()
> if (m_len == 0)
> return;
>
> - /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1. */
> - scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
> + /* NB: widest_int_or_QI_vector_mode_for_size checks M_MAX_SIZE > 1. */
> + machine_mode mode
> + = widest_int_or_QI_vector_mode_for_size (m_max_size,
> + m_qi_vector_mode);
> mode = get_usable_mode (mode, m_len);
>
> by_pieces_prev to_prev = { nullptr, mode };
> @@ -1175,7 +1227,7 @@ op_by_pieces_d::run ()
>
> do
> {
> - unsigned int size = GET_MODE_SIZE (mode);
> + unsigned int size = GET_MODE_SIZE (mode).to_constant ();
> rtx to1 = NULL_RTX, from1;
>
> while (m_len >= size)
> @@ -1214,9 +1266,10 @@ op_by_pieces_d::run ()
> /* NB: Generate overlapping operations if it is not a stack
> push since stack push must not overlap. Get the smallest
> integer mode for M_LEN bytes. */
> - mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT);
> - mode = get_usable_mode (mode, GET_MODE_SIZE (mode));
> - int gap = GET_MODE_SIZE (mode) - m_len;
> + mode = smallest_mode_for_size (m_len * BITS_PER_UNIT);
> + unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();
> + mode = get_usable_mode (mode, mode_size);
> + int gap = mode_size - m_len;
> if (gap > 0)
> {
> /* If size of MODE > M_LEN, generate the last operation
> @@ -1231,8 +1284,9 @@ op_by_pieces_d::run ()
> }
> else
> {
> - /* NB: widest_int_mode_for_size checks SIZE > 1. */
> - mode = widest_int_mode_for_size (size);
> + /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1. */
> + mode = widest_int_or_QI_vector_mode_for_size (size,
> + m_qi_vector_mode);
> mode = get_usable_mode (mode, m_len);
> }
> }
> @@ -1355,9 +1409,10 @@ class store_by_pieces_d : public op_by_pieces_d
>
> public:
> store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
> - unsigned HOST_WIDE_INT len, unsigned int align)
> + unsigned HOST_WIDE_INT len, unsigned int align,
> + bool qi_vector_mode = false)
> : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
> - align, false)
> + align, false, qi_vector_mode)
> {
> }
> rtx finish_retmode (memop_ret);
> @@ -1446,13 +1501,14 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
> max_size = STORE_MAX_PIECES + 1;
> while (max_size > 1 && l > 0)
> {
> - scalar_int_mode mode = widest_int_mode_for_size (max_size);
> + machine_mode mode
> + = widest_int_or_QI_vector_mode_for_size (max_size, memsetp);
>
> icode = optab_handler (mov_optab, mode);
> if (icode != CODE_FOR_nothing
> && align >= GET_MODE_ALIGNMENT (mode))
> {
> - unsigned int size = GET_MODE_SIZE (mode);
> + unsigned int size = GET_MODE_SIZE (mode).to_constant ();
>
> while (l >= size)
> {
> @@ -1470,7 +1526,7 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
> }
> }
>
> - max_size = GET_MODE_SIZE (mode);
> + max_size = GET_MODE_SIZE (mode).to_constant ();
> }
>
> /* The code above should have handled everything. */
> @@ -1504,7 +1560,8 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
> memsetp ? SET_BY_PIECES : STORE_BY_PIECES,
> optimize_insn_for_speed_p ()));
>
> - store_by_pieces_d data (to, constfun, constfundata, len, align);
> + store_by_pieces_d data (to, constfun, constfundata, len, align,
> + memsetp);
> data.run ();
>
> if (retmode != RETURN_BEGIN)
> @@ -1517,7 +1574,7 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
> Return const0_rtx unconditionally. */
>
> static rtx
> -clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode)
> +clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, machine_mode)
> {
> return const0_rtx;
> }
> @@ -5754,7 +5811,7 @@ emit_storent_insn (rtx to, rtx from)
>
> static rtx
> string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
> - scalar_int_mode mode)
> + machine_mode mode)
> {
> tree str = (tree) data;
>
> @@ -5762,17 +5819,19 @@ string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
> if (offset >= TREE_STRING_LENGTH (str))
> return const0_rtx;
>
> - if ((unsigned HOST_WIDE_INT) offset + GET_MODE_SIZE (mode)
> + unsigned int size = GET_MODE_SIZE (mode).to_constant ();
> + if ((unsigned HOST_WIDE_INT) offset + size
> > (unsigned HOST_WIDE_INT) TREE_STRING_LENGTH (str))
> {
> - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> + char *p = XALLOCAVEC (char, size);
> size_t l = TREE_STRING_LENGTH (str) - offset;
> memcpy (p, TREE_STRING_POINTER (str) + offset, l);
> - memset (p + l, '\0', GET_MODE_SIZE (mode) - l);
> - return c_readstr (p, mode, false);
> + memset (p + l, '\0', size - l);
> + return c_readstr (p, as_a <scalar_int_mode> (mode), false);
> }
>
> - return c_readstr (TREE_STRING_POINTER (str) + offset, mode, false);
> + return c_readstr (TREE_STRING_POINTER (str) + offset,
> + as_a <scalar_int_mode> (mode), false);
> }
>
> /* Generate code for computing expression EXP,
> diff --git a/gcc/expr.h b/gcc/expr.h
> index a4f44265759..552645e4403 100644
> --- a/gcc/expr.h
> +++ b/gcc/expr.h
> @@ -108,13 +108,13 @@ enum block_op_methods
> };
>
> typedef rtx (*by_pieces_constfn) (void *, void *, HOST_WIDE_INT,
> - scalar_int_mode);
> + machine_mode);
>
> /* The second pointer passed to by_pieces_constfn. */
> struct by_pieces_prev
> {
> rtx data;
> - scalar_int_mode mode;
> + machine_mode mode;
> };
>
> extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
> diff --git a/gcc/target.def b/gcc/target.def
> index 2e40448e6c5..cc4aa3a4212 100644
> --- a/gcc/target.def
> +++ b/gcc/target.def
> @@ -2726,6 +2726,13 @@ DEFHOOK
> rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),
> NULL)
>
> +DEFHOOK
> +(gen_memset_scratch_rtx,
> + "This hook should return an rtx for scratch register in @var{mode} to\n\
> +be used by memset broadcast. The default is @code{gen_reg_rtx}.",
> + rtx, (machine_mode mode),
> + gen_reg_rtx)
> +
> /* Return a new value for loop unroll size. */
> DEFHOOK
> (loop_unroll_adjust,
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c
> index b6dbcf7809b..007e79f91b0 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-3.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c
> @@ -10,6 +10,6 @@ foo (void)
> }
>
> /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
> +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
> /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
> /* { dg-final { scan-assembler-not "vmovdqa" } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> index f41e6147b4c..1e50dc842bc 100644
> --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> @@ -4,6 +4,6 @@
> #include "pr100865-4a.c"
>
> /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */
> +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */
> /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
> /* { dg-final { scan-assembler-not "vmovdqa" } } */
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Add QI vector mode support to by-pieces for memset
2021-07-19 14:40 ` Richard Sandiford
@ 2021-07-20 2:43 ` H.J. Lu
2021-07-20 6:38 ` Richard Sandiford
0 siblings, 1 reply; 14+ messages in thread
From: H.J. Lu @ 2021-07-20 2:43 UTC (permalink / raw)
To: H.J. Lu via Gcc-patches, Jeff Law, H.J. Lu, Richard Sandiford
On Mon, Jul 19, 2021 at 7:41 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > On Fri, Jul 16, 2021 at 7:15 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >>
> >> On Fri, Jul 16, 2021 at 6:24 AM Richard Sandiford
> >> <richard.sandiford@arm.com> wrote:
> >> >
> >> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > > On Fri, Jul 16, 2021 at 4:38 AM Richard Sandiford
> >> > > <richard.sandiford@arm.com> wrote:
> >> > >>
> >> > >> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > >> > 1. Rewrite builtin_memset_read_str and builtin_memset_gen_str with
> >> > >> > vec_duplicate_optab to duplicate QI value to TI/OI/XI value.
> >> > >> > 2. Add TARGET_GEN_MEMSET_SCRATCH_RTX to allow the backend to use a hard
> >> > >> > scratch register to avoid stack realignment when expanding memset.
> >> > >> >
> >> > >> > PR middle-end/90773
> >> > >> > * builtins.c (gen_memset_value_from_prev): New function.
> >> > >> > (gen_memset_broadcast): Likewise.
> >> > >> > (builtin_memset_read_str): Use gen_memset_value_from_prev
> >> > >> > and gen_memset_broadcast.
> >> > >> > (builtin_memset_gen_str): Likewise.
> >> > >> > * target.def (gen_memset_scratch_rtx): New hook.
> >> > >> > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
> >> > >> > * doc/tm.texi: Regenerated.
> >> > >> > ---
> >> > >> > gcc/builtins.c | 123 +++++++++++++++++++++++++++++++++++++--------
> >> > >> > gcc/doc/tm.texi | 5 ++
> >> > >> > gcc/doc/tm.texi.in | 2 +
> >> > >> > gcc/target.def | 7 +++
> >> > >> > 4 files changed, 116 insertions(+), 21 deletions(-)
> >> > >> >
> >> > >> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> >> > >> > index 39ab139b7e1..c1758ae2efc 100644
> >> > >> > --- a/gcc/builtins.c
> >> > >> > +++ b/gcc/builtins.c
> >> > >> > @@ -6686,26 +6686,111 @@ expand_builtin_strncpy (tree exp, rtx target)
> >> > >> > return NULL_RTX;
> >> > >> > }
> >> > >> >
> >> > >> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> >> > >> > - bytes from constant string DATA + OFFSET and return it as target
> >> > >> > - constant. If PREV isn't nullptr, it has the RTL info from the
> >> > >> > +/* Return the RTL of a register in MODE generated from PREV in the
> >> > >> > previous iteration. */
> >> > >> >
> >> > >> > -rtx
> >> > >> > -builtin_memset_read_str (void *data, void *prevp,
> >> > >> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> >> > >> > - scalar_int_mode mode)
> >> > >> > +static rtx
> >> > >> > +gen_memset_value_from_prev (void *prevp, scalar_int_mode mode)
> >> > >> > {
> >> > >> > + rtx target = nullptr;
> >> > >> > by_pieces_prev *prev = (by_pieces_prev *) prevp;
> >> > >> > if (prev != nullptr && prev->data != nullptr)
> >> > >> > {
> >> > >> > /* Use the previous data in the same mode. */
> >> > >> > if (prev->mode == mode)
> >> > >> > return prev->data;
> >> > >> > +
> >> > >> > + rtx prev_rtx = prev->data;
> >> > >> > + machine_mode prev_mode = prev->mode;
> >> > >> > + unsigned int word_size = GET_MODE_SIZE (word_mode);
> >> > >> > + if (word_size < GET_MODE_SIZE (prev->mode)
> >> > >> > + && word_size > GET_MODE_SIZE (mode))
> >> > >> > + {
> >> > >> > + /* First generate subreg of word mode if the previous mode is
> >> > >> > + wider than word mode and word mode is wider than MODE. */
> >> > >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> >> > >> > + prev_mode, 0);
> >> > >> > + prev_mode = word_mode;
> >> > >> > + }
> >> > >> > + if (prev_rtx != nullptr)
> >> > >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
> >> > >> > }
> >> > >> > + return target;
> >> > >> > +}
> >> > >> > +
> >> > >> > +/* Return the RTL of a register in MODE broadcasted from DATA. */
> >> > >> > +
> >> > >> > +static rtx
> >> > >> > +gen_memset_broadcast (rtx data, scalar_int_mode mode)
> >> > >> > +{
> >> > >> > + /* Skip if regno_reg_rtx isn't initialized. */
> >> > >> > + if (!regno_reg_rtx)
> >> > >> > + return nullptr;
> >> > >> > +
> >> > >> > + rtx target = nullptr;
> >> > >> > +
> >> > >> > + unsigned int nunits = GET_MODE_SIZE (mode) / GET_MODE_SIZE (QImode);
> >> > >> > + machine_mode vector_mode;
> >> > >> > + if (!mode_for_vector (QImode, nunits).exists (&vector_mode))
> >> > >> > + gcc_unreachable ();
> >> > >>
> >> > >> Sorry, I realise it's a bit late to be raising this objection now,
> >> > >> but I don't think it's a good idea to use scalar integer modes as
> >> > >> a proxy for vector modes. In principle there's no reason why a
> >> > >> target has to define an integer mode for every vector mode.
> >> > >
> >> > > A target always defines the largest integer mode.
> >> >
> >> > Right. But a target shouldn't *need* to define an integer mode
> >> > for every vector mode.
> >> >
> >> > >> If we want the mode to be a vector then I think the by-pieces
> >> > >> infrastructure should be extended to support vectors directly,
> >> > >> rather than assuming that each piece can be represented as
> >> > >> a scalar_int_mode.
> >> > >>
> >> > >
> >> > > The current by-pieces infrastructure operates on scalar_int_mode.
> >> > > Only for memset, there is
> >> > >
> >> > > /* Callback routine for store_by_pieces. Return the RTL of a register
> >> > > containing GET_MODE_SIZE (MODE) consecutive copies of the unsigned
> >> > > char value given in the RTL register data. For example, if mode is
> >> > > 4 bytes wide, return the RTL for 0x01010101*data. If PREV isn't
> >> > > nullptr, it has the RTL info from the previous iteration. */
> >> > >
> >> > > static rtx
> >> > > builtin_memset_gen_str (void *data, void *prevp,
> >> > > HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> >> > > scalar_int_mode mode)
> >> > >
> >> > > It is a broadcast. If a target can broadcast a byte to a wider integer,
> >> > > can you suggest a way to use it in the current by-pieces infrastructure?
> >> >
> >> > I meant that we should extend the by-pieces infrastructure so that
> >> > it no longer requires scalar_int_mode, rather than work within the
> >> > current limits.
> >> >
> >> > IMO the fact that we're (legitimately) going through vec_duplicate_optab
> >>
> >> If vec_duplicate_optab is the only blocking issue, can we add an integer
> >> broadcast optab? I want to avoid a major surgery just because we
> >> don't support integer broadcast from byte.
> >>
> >> > shows that we really are dealing with vectors here, not integers.
> >> >
> >>
> >> Are you suggesting we should add the optional vector mode
> >> support to by-pieces for memset?
> >>
> >
> > Like this?
> >
> > 1. Replace scalar_int_mode with machine_mode in the by-pieces
> > infrastructure to allow non-integer mode.
>
> I guess it might be better to use fixed_size_mode instead of
> machine_mode.
Fixed.
> > 2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size
> > to return QI vector mode for memset.
> > 3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest
> > integer or QI vector mode.
> >
> > Thanks.
> >
> > --
> > H.J.
> >
> > From 171f0a919300a8aac58dfe5fc9904eaa8fe0a05b Mon Sep 17 00:00:00 2001
> > From: "H.J. Lu" <hjl.tools@gmail.com>
> > Date: Sun, 6 Mar 2016 06:38:21 -0800
> > Subject: [PATCH] Add QI vector mode support to by-pieces for memset
> >
> > 1. Replace scalar_int_mode with machine_mode in the by-pieces
> > infrastructure to allow non-integer mode.
> > 2. Rename widest_int_mode_for_size to widest_int_or_QI_vector_mode_for_size
> > to return QI vector mode for memset.
> > 3. Add op_by_pieces_d::smallest_mode_for_size to return the smallest
> > integer or QI vector mode.
> >
> > gcc/
> >
> > PR middle-end/90773
> > * builtins.c (builtin_memcpy_read_str): Change the mode argument
> > from scalar_int_mode to machine_mode.
> > (builtin_strncpy_read_str): Likewise.
> > (gen_memset_value_from_prev): New function.
> > (gen_memset_broadcast): Likewise.
> > (builtin_memset_read_str): Change the mode argument from
> > scalar_int_mode to machine_mode. Use gen_memset_value_from_prev
> > and gen_memset_broadcast.
> > (builtin_memset_gen_str): Likewise.
> > (try_store_by_multiple_pieces): Use by_pieces_constfn to declare
> > constfun.
> > * builtins.h (builtin_strncpy_read_str): Updated.
> > (builtin_memset_read_str): Likewise.
> > * expr.c (widest_int_mode_for_size): Renamed to ...
> > (widest_int_or_QI_vector_mode_for_size): Add a bool argument to
> > indicate if QI vector mode can be used.
> > (by_pieces_ninsns): Call widest_int_or_QI_vector_mode_for_size
> > instead of widest_int_mode_for_size.
> > (pieces_addr::adjust): Change the mode argument from
> > scalar_int_mode to machine_mode.
> > (op_by_pieces_d): Add a bool member, m_qi_vector_mode, to
> > indicate that QI vector mode can be used.
> > (op_by_pieces_d::op_by_pieces_d): Add a bool argument to initialize
> > m_qi_vector_mode. Call widest_int_or_QI_vector_mode_for_size
> > instead of widest_int_mode_for_size.
> > (op_by_pieces_d::get_usable_mode): Change the mode argument from
> > scalar_int_mode to machine_mode. Call
> > widest_int_or_QI_vector_mode_for_size instead of
> > widest_int_mode_for_size.
> > (op_by_pieces_d::smallest_mode_for_size): New member function
> > to return the smallest integer or QI vector mode.
> > (op_by_pieces_d::run): Call widest_int_or_QI_vector_mode_for_size
> > instead of widest_int_mode_for_size. Call smallest_mode_for_size
> > instead of smallest_int_mode_for_size.
> > (store_by_pieces_d::store_by_pieces_d): Add a bool argument to
> > indicate that QI vector mode can be used and pass it to
> > op_by_pieces_d::op_by_pieces_d.
> > (can_store_by_pieces): Call widest_int_or_QI_vector_mode_for_size
> > instead of widest_int_mode_for_size.
> > (store_by_pieces::store_by_pieces): Pass memsetp to
> > store_by_pieces_d::store_by_pieces_d.
> > (clear_by_pieces_1): Change scalar_int_mode to machine_mode.
> > (string_cst_read_str): Change the mode argument from
> > scalar_int_mode to machine_mode.
> > * expr.h (by_pieces_constfn): Change scalar_int_mode to
> > machine_mode.
> > (by_pieces_prev): Likewise.
> > * target.def (gen_memset_scratch_rtx): New hook.
> > * doc/tm.texi.in: Add TARGET_GEN_MEMSET_SCRATCH_RTX.
> > * doc/tm.texi: Regenerated.
> >
> > gcc/testsuite/
> >
> > * gcc.target/i386/pr100865-3.c: Expect vmovdqu8 instead of
> > vmovdqu.
> > * gcc.target/i386/pr100865-4b.c: Likewise.
> > ---
> > gcc/builtins.c | 138 ++++++++++++++-----
> > gcc/builtins.h | 4 +-
> > gcc/doc/tm.texi | 5 +
> > gcc/doc/tm.texi.in | 2 +
> > gcc/expr.c | 145 ++++++++++++++------
> > gcc/expr.h | 4 +-
> > gcc/target.def | 7 +
> > gcc/testsuite/gcc.target/i386/pr100865-3.c | 2 +-
> > gcc/testsuite/gcc.target/i386/pr100865-4b.c | 2 +-
> > 9 files changed, 229 insertions(+), 80 deletions(-)
> >
> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> > index 39ab139b7e1..876378b467c 100644
> > --- a/gcc/builtins.c
> > +++ b/gcc/builtins.c
> > @@ -3890,13 +3890,14 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode)
> >
> > static rtx
> > builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> > - scalar_int_mode mode)
> > + machine_mode mode)
> > {
> > /* The REPresentation pointed to by DATA need not be a nul-terminated
> > string but the caller guarantees it's large enough for MODE. */
> > const char *rep = (const char *) data;
> >
> > - return c_readstr (rep + offset, mode, /*nul_terminated=*/false);
> > + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),
> > + /*nul_terminated=*/false);
> > }
> >
> > /* LEN specify length of the block of memcpy/memset operation.
> > @@ -6478,14 +6479,14 @@ expand_builtin_stpncpy (tree exp, rtx)
> >
> > rtx
> > builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> > - scalar_int_mode mode)
> > + machine_mode mode)
> > {
> > const char *str = (const char *) data;
> >
> > if ((unsigned HOST_WIDE_INT) offset > strlen (str))
> > return const0_rtx;
> >
> > - return c_readstr (str + offset, mode);
> > + return c_readstr (str + offset, as_a <scalar_int_mode> (mode));
> > }
> > /* Helper to check the sizes of sequences and the destination of calls
>
> I think both of these as_a<scalar_int_mode>s should have a comment
> explaining that we don't (yet?) consider using vector modes for this
> operation.
Fixed.
> > @@ -6686,30 +6687,109 @@ expand_builtin_strncpy (tree exp, rtx target)
> > return NULL_RTX;
> > }
> >
> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> > - bytes from constant string DATA + OFFSET and return it as target
> > - constant. If PREV isn't nullptr, it has the RTL info from the
> > +/* Return the RTL of a register in MODE generated from PREV in the
> > previous iteration. */
> >
> > -rtx
> > -builtin_memset_read_str (void *data, void *prevp,
> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > - scalar_int_mode mode)
> > +static rtx
> > +gen_memset_value_from_prev (void *prevp, machine_mode mode)
> > {
> > + rtx target = nullptr;
> > by_pieces_prev *prev = (by_pieces_prev *) prevp;
>
> I think it'd be better to pass the proper type instead, since this
> function isn't directly implementing a callback interface.
Fixed.
> > if (prev != nullptr && prev->data != nullptr)
> > {
> > /* Use the previous data in the same mode. */
> > if (prev->mode == mode)
> > return prev->data;
> > +
> > + rtx prev_rtx = prev->data;
> > + machine_mode prev_mode = prev->mode;
> > + unsigned int word_size = GET_MODE_SIZE (word_mode);
> > + if (word_size < GET_MODE_SIZE (prev->mode).to_constant ()
> > + && word_size > GET_MODE_SIZE (mode).to_constant ())
>
> Using fixed_size_mode would also avoid the to_constants here.
Fixed.
> UNITS_PER_WORD is more direct/canonical than GET_MODE_SIZE (word_mode).
Fixed.
> > + {
> > + /* First generate subreg of word mode if the previous mode is
> > + wider than word mode and word mode is wider than MODE. */
> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> > + prev_mode, 0);
> > + prev_mode = word_mode;
> > + }
> > + if (prev_rtx != nullptr)
> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
>
> This should be lowpart_subreg, since 0 isn't the right offset for
> big-endian targets. Using lowpart_subreg should also avoid the need
> for the word_size “if” above: lowpart_subreg can handle lowpart subword
> subregs of multiword values.
I tried it. It didn't work since it caused the LRA failure. I replaced
simplify_gen_subreg with lowpart_subreg instead.
> > + }
> > + return target;
> > +}
> > +
> > +/* Return the RTL of a register in MODE broadcasted from DATA. */
> > +
> > +static rtx
> > +gen_memset_broadcast (rtx data, machine_mode mode)
> > +{
> > + /* Skip if regno_reg_rtx isn't initialized or not vector mode. */
>
> Why's the regno_reg_rtx condition needed?
Removed.
> > + if (!regno_reg_rtx || !VECTOR_MODE_P (mode))
> > + return nullptr;
> > +
> > + gcc_assert (GET_MODE_INNER (mode) == QImode);
> > +
> > + enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
> > +
> > + gcc_assert (icode != CODE_FOR_nothing);
>
> Might be worth a comment here saying that having vec_duplicate_optab is
> a precondition for picking a vector mode.
I added a comment.
> > +
> > + rtx target = targetm.gen_memset_scratch_rtx (mode);
> > + if (CONST_INT_P (data))
> > + {
> > + /* Use the move expander with CONST_VECTOR. */
> > + rtx const_vec = gen_const_vec_duplicate (mode, data);
> > + emit_move_insn (target, const_vec);
> > + }
> > + else
> > + {
> > +
>
> Nit: excess blank line.
Fixed.
> > + class expand_operand ops[2];
> > + create_output_operand (&ops[0], target, mode);
> > + create_input_operand (&ops[1], data, QImode);
> > + expand_insn (icode, 2, ops);
> > + if (!rtx_equal_p (target, ops[0].value))
> > + emit_move_insn (target, ops[0].value);
> > }
> >
> > + return target;
> > +}
> > +
> > +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> > + bytes from constant string DATA + OFFSET and return it as target
> > + constant. If PREV isn't nullptr, it has the RTL info from the
> > + previous iteration. */
> > +
> > +rtx
> > +builtin_memset_read_str (void *data, void *prev,
> > + HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > + machine_mode mode)
> > +{
> > + rtx target;
> > const char *c = (const char *) data;
> > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> > + char *p;
> > + unsigned int size = GET_MODE_SIZE (mode).to_constant ();
> > +
> > + /* Don't use the previous value if size is 1. */
>
> Why not though? The existing code does, and that seems like the right
> thing to do when operating on integers.
>
> I can see the check would be a good thing to do if MODE isn't a vector
> mode and the previous mode was. Is that the case you're worried about?
> If so, that check could go in gen_memset_value_from_prev instead.
We are storing one byte. Doing it directly is faster.
> > + if (size != 1)
> > + {
> > + target = gen_memset_value_from_prev (prev, mode);
> > + if (target != nullptr)
> > + return target;
> > +
> > + p = XALLOCAVEC (char, GET_MODE_SIZE (QImode));
> > + memset (p, *c, GET_MODE_SIZE (QImode));
> > + rtx src = c_readstr (p, QImode);
>
> This seems unnecessarily complicated. Couldn't it just be
>
> gen_int_mode (QImode, *c);
>
> instead? There are no endianness concerns for single QI values.
Fixed.
> > + target = gen_memset_broadcast (src, mode);
> > + if (target != nullptr)
> > + return target;
> > + }
> > +
> > + p = XALLOCAVEC (char, size);
> >
> > - memset (p, *c, GET_MODE_SIZE (mode));
> > + memset (p, *c, size);
> >
> > - return c_readstr (p, mode);
> > + return c_readstr (p, as_a <scalar_int_mode> (mode));
> > }
> >
> > /* Callback routine for store_by_pieces. Return the RTL of a register
> > @@ -6719,33 +6799,29 @@ builtin_memset_read_str (void *data, void *prevp,
> > nullptr, it has the RTL info from the previous iteration. */
> >
> > static rtx
> > -builtin_memset_gen_str (void *data, void *prevp,
> > +builtin_memset_gen_str (void *data, void *prev,
> > HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > - scalar_int_mode mode)
> > + machine_mode mode)
> > {
> > rtx target, coeff;
> > size_t size;
> > char *p;
> >
> > - by_pieces_prev *prev = (by_pieces_prev *) prevp;
> > - if (prev != nullptr && prev->data != nullptr)
> > - {
> > - /* Use the previous data in the same mode. */
> > - if (prev->mode == mode)
> > - return prev->data;
> > -
> > - target = simplify_gen_subreg (mode, prev->data, prev->mode, 0);
> > - if (target != nullptr)
> > - return target;
> > - }
> > -
> > - size = GET_MODE_SIZE (mode);
> > + size = GET_MODE_SIZE (mode).to_constant ();
> > if (size == 1)
> > return (rtx) data;
> >
> > + target = gen_memset_value_from_prev (prev, mode);
> > + if (target != nullptr)
> > + return target;
> > +
> > + target = gen_memset_broadcast ((rtx) data, mode);
> > + if (target != nullptr)
> > + return target;
> > +
> > p = XALLOCAVEC (char, size);
> > memset (p, 1, size);
> > - coeff = c_readstr (p, mode);
> > + coeff = c_readstr (p, as_a <scalar_int_mode> (mode));
> >
> > target = convert_to_mode (mode, (rtx) data, 1);
> > target = expand_mult (mode, target, coeff, NULL_RTX, 1);
> > @@ -6849,7 +6925,7 @@ try_store_by_multiple_pieces (rtx to, rtx len, unsigned int ctz_len,
> > &valc, align, true))
> > return false;
> >
> > - rtx (*constfun) (void *, void *, HOST_WIDE_INT, scalar_int_mode);
> > + by_pieces_constfn constfun;
> > void *constfundata;
> > if (val)
> > {
> > diff --git a/gcc/builtins.h b/gcc/builtins.h
> > index a64ece3f1cd..31d07621750 100644
> > --- a/gcc/builtins.h
> > +++ b/gcc/builtins.h
> > @@ -111,9 +111,9 @@ extern tree mathfn_built_in (tree, enum built_in_function fn);
> > extern tree mathfn_built_in (tree, combined_fn);
> > extern tree mathfn_built_in_type (combined_fn);
> > extern rtx builtin_strncpy_read_str (void *, void *, HOST_WIDE_INT,
> > - scalar_int_mode);
> > + machine_mode);
> > extern rtx builtin_memset_read_str (void *, void *, HOST_WIDE_INT,
> > - scalar_int_mode);
> > + machine_mode);
> > extern rtx expand_builtin_saveregs (void);
> > extern tree std_build_builtin_va_list (void);
> > extern tree std_fn_abi_va_list (tree);
> > diff --git a/gcc/doc/tm.texi b/gcc/doc/tm.texi
> > index 3ad39443eba..c1995270720 100644
> > --- a/gcc/doc/tm.texi
> > +++ b/gcc/doc/tm.texi
> > @@ -12123,6 +12123,11 @@ This function prepares to emit a conditional comparison within a sequence
> > @var{bit_code} is @code{AND} or @code{IOR}, which is the op on the compares.
> > @end deftypefn
> >
> > +@deftypefn {Target Hook} rtx TARGET_GEN_MEMSET_SCRATCH_RTX (machine_mode @var{mode})
> > +This hook should return an rtx for scratch register in @var{mode} to
> > +be used by memset broadcast. The default is @code{gen_reg_rtx}.
>
> This should give a hint why the default might not be appropriate.
Updated.
> > +@end deftypefn
> > +
> > @deftypefn {Target Hook} unsigned TARGET_LOOP_UNROLL_ADJUST (unsigned @var{nunroll}, class loop *@var{loop})
> > This target hook returns a new value for the number of times @var{loop}
> > should be unrolled. The parameter @var{nunroll} is the number of times
> > diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
> > index f881cdabe9e..a6bbf4f2667 100644
> > --- a/gcc/doc/tm.texi.in
> > +++ b/gcc/doc/tm.texi.in
> > @@ -7958,6 +7958,8 @@ lists.
> >
> > @hook TARGET_GEN_CCMP_NEXT
> >
> > +@hook TARGET_GEN_MEMSET_SCRATCH_RTX
> > +
> > @hook TARGET_LOOP_UNROLL_ADJUST
> >
> > @defmac POWI_MAX_MULTS
> > diff --git a/gcc/expr.c b/gcc/expr.c
> > index 6a4368113c4..92dbf47194e 100644
> > --- a/gcc/expr.c
> > +++ b/gcc/expr.c
> > @@ -769,15 +769,36 @@ alignment_for_piecewise_move (unsigned int max_pieces, unsigned int align)
> > return align;
> > }
> >
> > -/* Return the widest integer mode that is narrower than SIZE bytes. */
> > +/* Return the widest integer or QI vector mode that is narrower than
> > + SIZE bytes. */
> >
> > -static scalar_int_mode
> > -widest_int_mode_for_size (unsigned int size)
> > +static machine_mode
> > +widest_int_or_QI_vector_mode_for_size (unsigned int size,
> > + bool qi_vector)
> > {
> > - scalar_int_mode result = NARROWEST_INT_MODE;
> > + machine_mode result = NARROWEST_INT_MODE;
> >
> > gcc_checking_assert (size > 1);
> >
> > + /* Use QI vector only if size is wider than a WORD. */
> > + if (qi_vector && size > GET_MODE_SIZE (word_mode))
> > + {
> > + machine_mode mode;
> > + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
> > + if (GET_MODE_INNER (mode) == QImode
> > + && GET_MODE_SIZE (mode).is_constant ())
> > + {
> > + if (GET_MODE_SIZE (mode).to_constant () >= size)
> > + break;
> > + if (optab_handler (vec_duplicate_optab, mode)
> > + != CODE_FOR_nothing)
> > + result = mode;
> > + }
> > +
> > + if (result != NARROWEST_INT_MODE)
> > + return result;
> > + }
> > +
> > opt_scalar_int_mode tmode;
> > FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT)
> > if (GET_MODE_SIZE (tmode.require ()) < size)
> > @@ -815,16 +836,18 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,
> > unsigned int max_size, by_pieces_operation op)
> > {
> > unsigned HOST_WIDE_INT n_insns = 0;
> > - scalar_int_mode mode;
> > + machine_mode mode;
> >
> > if (targetm.overlap_op_by_pieces_p () && op != COMPARE_BY_PIECES)
> > {
> > /* NB: Round up L and ALIGN to the widest integer mode for
> > MAX_SIZE. */
> > - mode = widest_int_mode_for_size (max_size);
> > + mode = widest_int_or_QI_vector_mode_for_size (max_size,
> > + op == SET_BY_PIECES);
> > if (optab_handler (mov_optab, mode) != CODE_FOR_nothing)
> > {
> > - unsigned HOST_WIDE_INT up = ROUND_UP (l, GET_MODE_SIZE (mode));
> > + unsigned HOST_WIDE_INT up = ROUND_UP (l,
> > + GET_MODE_SIZE (mode).to_constant ());
> > if (up > l)
> > l = up;
> > align = GET_MODE_ALIGNMENT (mode);
> > @@ -835,10 +858,11 @@ by_pieces_ninsns (unsigned HOST_WIDE_INT l, unsigned int align,
> >
> > while (max_size > 1 && l > 0)
> > {
> > - mode = widest_int_mode_for_size (max_size);
> > + mode = widest_int_or_QI_vector_mode_for_size (max_size,
> > + op == SET_BY_PIECES);
> > enum insn_code icode;
> >
> > - unsigned int modesize = GET_MODE_SIZE (mode);
> > + unsigned int modesize = GET_MODE_SIZE (mode).to_constant ();
> >
> > icode = optab_handler (mov_optab, mode);
> > if (icode != CODE_FOR_nothing && align >= GET_MODE_ALIGNMENT (mode))
> > @@ -903,8 +927,7 @@ class pieces_addr
> > void *m_cfndata;
> > public:
> > pieces_addr (rtx, bool, by_pieces_constfn, void *);
> > - rtx adjust (scalar_int_mode, HOST_WIDE_INT,
> > - by_pieces_prev * = nullptr);
> > + rtx adjust (machine_mode, HOST_WIDE_INT, by_pieces_prev * = nullptr);
> > void increment_address (HOST_WIDE_INT);
> > void maybe_predec (HOST_WIDE_INT);
> > void maybe_postinc (HOST_WIDE_INT);
> > @@ -1006,7 +1029,7 @@ pieces_addr::decide_autoinc (machine_mode ARG_UNUSED (mode), bool reverse,
> > but we still modify the MEM's properties. */
> >
> > rtx
> > -pieces_addr::adjust (scalar_int_mode mode, HOST_WIDE_INT offset,
> > +pieces_addr::adjust (machine_mode mode, HOST_WIDE_INT offset,
> > by_pieces_prev *prev)
> > {
> > if (m_constfn)
> > @@ -1060,7 +1083,8 @@ pieces_addr::maybe_postinc (HOST_WIDE_INT size)
> > class op_by_pieces_d
> > {
> > private:
> > - scalar_int_mode get_usable_mode (scalar_int_mode mode, unsigned int);
> > + machine_mode get_usable_mode (machine_mode, unsigned int);
> > + machine_mode smallest_mode_for_size (unsigned int);
> >
> > protected:
> > pieces_addr m_to, m_from;
> > @@ -1073,6 +1097,8 @@ class op_by_pieces_d
> > bool m_push;
> > /* True if targetm.overlap_op_by_pieces_p () returns true. */
> > bool m_overlap_op_by_pieces;
> > + /* True if QI vector mode can be used. */
> > + bool m_qi_vector_mode;
> >
> > /* Virtual functions, overriden by derived classes for the specific
> > operation. */
> > @@ -1084,7 +1110,8 @@ class op_by_pieces_d
> >
> > public:
> > op_by_pieces_d (rtx, bool, rtx, bool, by_pieces_constfn, void *,
> > - unsigned HOST_WIDE_INT, unsigned int, bool);
> > + unsigned HOST_WIDE_INT, unsigned int, bool,
> > + bool = false);
> > void run ();
> > };
> >
> > @@ -1099,11 +1126,12 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
> > by_pieces_constfn from_cfn,
> > void *from_cfn_data,
> > unsigned HOST_WIDE_INT len,
> > - unsigned int align, bool push)
> > + unsigned int align, bool push,
> > + bool qi_vector_mode)
> > : m_to (to, to_load, NULL, NULL),
> > m_from (from, from_load, from_cfn, from_cfn_data),
> > m_len (len), m_max_size (MOVE_MAX_PIECES + 1),
> > - m_push (push)
> > + m_push (push), m_qi_vector_mode (qi_vector_mode)
> > {
> > int toi = m_to.get_addr_inc ();
> > int fromi = m_from.get_addr_inc ();
> > @@ -1124,7 +1152,9 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
> > if (by_pieces_ninsns (len, align, m_max_size, MOVE_BY_PIECES) > 2)
> > {
> > /* Find the mode of the largest comparison. */
> > - scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
> > + machine_mode mode
> > + = widest_int_or_QI_vector_mode_for_size (m_max_size,
> > + m_qi_vector_mode);
> >
> > m_from.decide_autoinc (mode, m_reverse, len);
> > m_to.decide_autoinc (mode, m_reverse, len);
> > @@ -1139,22 +1169,42 @@ op_by_pieces_d::op_by_pieces_d (rtx to, bool to_load,
> > /* This function returns the largest usable integer mode for LEN bytes
> > whose size is no bigger than size of MODE. */
> >
> > -scalar_int_mode
> > -op_by_pieces_d::get_usable_mode (scalar_int_mode mode, unsigned int len)
> > +machine_mode
> > +op_by_pieces_d::get_usable_mode (machine_mode mode, unsigned int len)
> > {
> > unsigned int size;
> > do
> > {
> > - size = GET_MODE_SIZE (mode);
> > + size = GET_MODE_SIZE (mode).to_constant ();
> > if (len >= size && prepare_mode (mode, m_align))
> > break;
> > - /* NB: widest_int_mode_for_size checks SIZE > 1. */
> > - mode = widest_int_mode_for_size (size);
> > + /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1. */
> > + mode = widest_int_or_QI_vector_mode_for_size (size,
> > + m_qi_vector_mode);
> > }
> > while (1);
> > return mode;
> > }
> >
> > +/* Return the smallest integer or QI vector mode that is not narrower
> > + than SIZE bits. */
> > +
> > +machine_mode
> > +op_by_pieces_d::smallest_mode_for_size (unsigned int size)
> > +{
> > + /* Use QI vector only for > size of WORD. */
> > + if (m_qi_vector_mode && size > GET_MODE_PRECISION (word_mode))
> > + {
> > + machine_mode mode;
> > + FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
> > + if (GET_MODE_INNER (mode) == QImode
> > + && known_ge (GET_MODE_PRECISION (mode), size))
> > + return mode;
>
> Shouldn't this also test for vec_duplicate_optab? I guess you're
> hoping that the previous register will be reused via a lowpart,
> but since that's up to the callbacks, I think we should be
> conservative here.
Fixed. I will submit the v2 patch.
Thanks.
> Alternatively, we could bypass the callbacks when this code is used.
>
> Thanks,
> Richard
>
> > + }
> > +
> > + return smallest_int_mode_for_size (size);
> > +}
> > +
> > /* This function contains the main loop used for expanding a block
> > operation. First move what we can in the largest integer mode,
> > then go to successively smaller modes. For every access, call
> > @@ -1166,8 +1216,10 @@ op_by_pieces_d::run ()
> > if (m_len == 0)
> > return;
> >
> > - /* NB: widest_int_mode_for_size checks M_MAX_SIZE > 1. */
> > - scalar_int_mode mode = widest_int_mode_for_size (m_max_size);
> > + /* NB: widest_int_or_QI_vector_mode_for_size checks M_MAX_SIZE > 1. */
> > + machine_mode mode
> > + = widest_int_or_QI_vector_mode_for_size (m_max_size,
> > + m_qi_vector_mode);
> > mode = get_usable_mode (mode, m_len);
> >
> > by_pieces_prev to_prev = { nullptr, mode };
> > @@ -1175,7 +1227,7 @@ op_by_pieces_d::run ()
> >
> > do
> > {
> > - unsigned int size = GET_MODE_SIZE (mode);
> > + unsigned int size = GET_MODE_SIZE (mode).to_constant ();
> > rtx to1 = NULL_RTX, from1;
> >
> > while (m_len >= size)
> > @@ -1214,9 +1266,10 @@ op_by_pieces_d::run ()
> > /* NB: Generate overlapping operations if it is not a stack
> > push since stack push must not overlap. Get the smallest
> > integer mode for M_LEN bytes. */
> > - mode = smallest_int_mode_for_size (m_len * BITS_PER_UNIT);
> > - mode = get_usable_mode (mode, GET_MODE_SIZE (mode));
> > - int gap = GET_MODE_SIZE (mode) - m_len;
> > + mode = smallest_mode_for_size (m_len * BITS_PER_UNIT);
> > + unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();
> > + mode = get_usable_mode (mode, mode_size);
> > + int gap = mode_size - m_len;
> > if (gap > 0)
> > {
> > /* If size of MODE > M_LEN, generate the last operation
> > @@ -1231,8 +1284,9 @@ op_by_pieces_d::run ()
> > }
> > else
> > {
> > - /* NB: widest_int_mode_for_size checks SIZE > 1. */
> > - mode = widest_int_mode_for_size (size);
> > + /* NB: widest_int_or_QI_vector_mode_for_size checks SIZE > 1. */
> > + mode = widest_int_or_QI_vector_mode_for_size (size,
> > + m_qi_vector_mode);
> > mode = get_usable_mode (mode, m_len);
> > }
> > }
> > @@ -1355,9 +1409,10 @@ class store_by_pieces_d : public op_by_pieces_d
> >
> > public:
> > store_by_pieces_d (rtx to, by_pieces_constfn cfn, void *cfn_data,
> > - unsigned HOST_WIDE_INT len, unsigned int align)
> > + unsigned HOST_WIDE_INT len, unsigned int align,
> > + bool qi_vector_mode = false)
> > : op_by_pieces_d (to, false, NULL_RTX, true, cfn, cfn_data, len,
> > - align, false)
> > + align, false, qi_vector_mode)
> > {
> > }
> > rtx finish_retmode (memop_ret);
> > @@ -1446,13 +1501,14 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
> > max_size = STORE_MAX_PIECES + 1;
> > while (max_size > 1 && l > 0)
> > {
> > - scalar_int_mode mode = widest_int_mode_for_size (max_size);
> > + machine_mode mode
> > + = widest_int_or_QI_vector_mode_for_size (max_size, memsetp);
> >
> > icode = optab_handler (mov_optab, mode);
> > if (icode != CODE_FOR_nothing
> > && align >= GET_MODE_ALIGNMENT (mode))
> > {
> > - unsigned int size = GET_MODE_SIZE (mode);
> > + unsigned int size = GET_MODE_SIZE (mode).to_constant ();
> >
> > while (l >= size)
> > {
> > @@ -1470,7 +1526,7 @@ can_store_by_pieces (unsigned HOST_WIDE_INT len,
> > }
> > }
> >
> > - max_size = GET_MODE_SIZE (mode);
> > + max_size = GET_MODE_SIZE (mode).to_constant ();
> > }
> >
> > /* The code above should have handled everything. */
> > @@ -1504,7 +1560,8 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
> > memsetp ? SET_BY_PIECES : STORE_BY_PIECES,
> > optimize_insn_for_speed_p ()));
> >
> > - store_by_pieces_d data (to, constfun, constfundata, len, align);
> > + store_by_pieces_d data (to, constfun, constfundata, len, align,
> > + memsetp);
> > data.run ();
> >
> > if (retmode != RETURN_BEGIN)
> > @@ -1517,7 +1574,7 @@ store_by_pieces (rtx to, unsigned HOST_WIDE_INT len,
> > Return const0_rtx unconditionally. */
> >
> > static rtx
> > -clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, scalar_int_mode)
> > +clear_by_pieces_1 (void *, void *, HOST_WIDE_INT, machine_mode)
> > {
> > return const0_rtx;
> > }
> > @@ -5754,7 +5811,7 @@ emit_storent_insn (rtx to, rtx from)
> >
> > static rtx
> > string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
> > - scalar_int_mode mode)
> > + machine_mode mode)
> > {
> > tree str = (tree) data;
> >
> > @@ -5762,17 +5819,19 @@ string_cst_read_str (void *data, void *, HOST_WIDE_INT offset,
> > if (offset >= TREE_STRING_LENGTH (str))
> > return const0_rtx;
> >
> > - if ((unsigned HOST_WIDE_INT) offset + GET_MODE_SIZE (mode)
> > + unsigned int size = GET_MODE_SIZE (mode).to_constant ();
> > + if ((unsigned HOST_WIDE_INT) offset + size
> > > (unsigned HOST_WIDE_INT) TREE_STRING_LENGTH (str))
> > {
> > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> > + char *p = XALLOCAVEC (char, size);
> > size_t l = TREE_STRING_LENGTH (str) - offset;
> > memcpy (p, TREE_STRING_POINTER (str) + offset, l);
> > - memset (p + l, '\0', GET_MODE_SIZE (mode) - l);
> > - return c_readstr (p, mode, false);
> > + memset (p + l, '\0', size - l);
> > + return c_readstr (p, as_a <scalar_int_mode> (mode), false);
> > }
> >
> > - return c_readstr (TREE_STRING_POINTER (str) + offset, mode, false);
> > + return c_readstr (TREE_STRING_POINTER (str) + offset,
> > + as_a <scalar_int_mode> (mode), false);
> > }
> >
> > /* Generate code for computing expression EXP,
> > diff --git a/gcc/expr.h b/gcc/expr.h
> > index a4f44265759..552645e4403 100644
> > --- a/gcc/expr.h
> > +++ b/gcc/expr.h
> > @@ -108,13 +108,13 @@ enum block_op_methods
> > };
> >
> > typedef rtx (*by_pieces_constfn) (void *, void *, HOST_WIDE_INT,
> > - scalar_int_mode);
> > + machine_mode);
> >
> > /* The second pointer passed to by_pieces_constfn. */
> > struct by_pieces_prev
> > {
> > rtx data;
> > - scalar_int_mode mode;
> > + machine_mode mode;
> > };
> >
> > extern rtx emit_block_move (rtx, rtx, rtx, enum block_op_methods);
> > diff --git a/gcc/target.def b/gcc/target.def
> > index 2e40448e6c5..cc4aa3a4212 100644
> > --- a/gcc/target.def
> > +++ b/gcc/target.def
> > @@ -2726,6 +2726,13 @@ DEFHOOK
> > rtx, (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev, int cmp_code, tree op0, tree op1, int bit_code),
> > NULL)
> >
> > +DEFHOOK
> > +(gen_memset_scratch_rtx,
> > + "This hook should return an rtx for scratch register in @var{mode} to\n\
> > +be used by memset broadcast. The default is @code{gen_reg_rtx}.",
> > + rtx, (machine_mode mode),
> > + gen_reg_rtx)
> > +
> > /* Return a new value for loop unroll size. */
> > DEFHOOK
> > (loop_unroll_adjust,
> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-3.c b/gcc/testsuite/gcc.target/i386/pr100865-3.c
> > index b6dbcf7809b..007e79f91b0 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-3.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-3.c
> > @@ -10,6 +10,6 @@ foo (void)
> > }
> >
> > /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
> > +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, \\(%\[\^,\]+\\)" 1 } } */
> > /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
> > /* { dg-final { scan-assembler-not "vmovdqa" } } */
> > diff --git a/gcc/testsuite/gcc.target/i386/pr100865-4b.c b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > index f41e6147b4c..1e50dc842bc 100644
> > --- a/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > +++ b/gcc/testsuite/gcc.target/i386/pr100865-4b.c
> > @@ -4,6 +4,6 @@
> > #include "pr100865-4a.c"
> >
> > /* { dg-final { scan-assembler-times "vpbroadcastb\[\\t \]+%(?:r|e)\[^\n\]*, %xmm\[0-9\]+" 1 } } */
> > -/* { dg-final { scan-assembler-times "vmovdqu\[\\t \]%xmm\[0-9\]+, " 4 } } */
> > +/* { dg-final { scan-assembler-times "vmovdqu8\[\\t \]%xmm\[0-9\]+, " 4 } } */
> > /* { dg-final { scan-assembler-not "vpbroadcastb\[\\t \]+%xmm\[0-9\]+, %xmm\[0-9\]+" } } */
> > /* { dg-final { scan-assembler-not "vmovdqa" } } */
--
H.J.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Add QI vector mode support to by-pieces for memset
2021-07-20 2:43 ` H.J. Lu
@ 2021-07-20 6:38 ` Richard Sandiford
2021-07-20 12:48 ` H.J. Lu
0 siblings, 1 reply; 14+ messages in thread
From: Richard Sandiford @ 2021-07-20 6:38 UTC (permalink / raw)
To: H.J. Lu via Gcc-patches; +Cc: Jeff Law, H.J. Lu
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> > + {
>> > + /* First generate subreg of word mode if the previous mode is
>> > + wider than word mode and word mode is wider than MODE. */
>> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
>> > + prev_mode, 0);
>> > + prev_mode = word_mode;
>> > + }
>> > + if (prev_rtx != nullptr)
>> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
>>
>> This should be lowpart_subreg, since 0 isn't the right offset for
>> big-endian targets. Using lowpart_subreg should also avoid the need
>> for the word_size “if” above: lowpart_subreg can handle lowpart subword
>> subregs of multiword values.
>
> I tried it. It didn't work since it caused the LRA failure. I replaced
> simplify_gen_subreg with lowpart_subreg instead.
What specifically went wrong?
>> > +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
>> > + bytes from constant string DATA + OFFSET and return it as target
>> > + constant. If PREV isn't nullptr, it has the RTL info from the
>> > + previous iteration. */
>> > +
>> > +rtx
>> > +builtin_memset_read_str (void *data, void *prev,
>> > + HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
>> > + machine_mode mode)
>> > +{
>> > + rtx target;
>> > const char *c = (const char *) data;
>> > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
>> > + char *p;
>> > + unsigned int size = GET_MODE_SIZE (mode).to_constant ();
>> > +
>> > + /* Don't use the previous value if size is 1. */
>>
>> Why not though? The existing code does, and that seems like the right
>> thing to do when operating on integers.
>>
>> I can see the check would be a good thing to do if MODE isn't a vector
>> mode and the previous mode was. Is that the case you're worried about?
>> If so, that check could go in gen_memset_value_from_prev instead.
>
> We are storing one byte. Doing it directly is faster.
But the first thing being protected here is…
>> > + if (size != 1)
>> > + {
>> > + target = gen_memset_value_from_prev (prev, mode);
>> > + if (target != nullptr)
>> > + return target;
…this attempt to use the previous value. If the target uses, say,
SImode for the first piece and QImode for a final byte, using the QImode
lowpart of the SImode register would avoid having to move the byte value
into a separate QImode register. Why's that a bad thing to do? AFAICT
it's what the current code would do, so if we want to change it even for
integer modes, I think it should be a separate patch with a separate
justification.
Like I say, I can understand that using the QImode lowpart of a vector
wouldn't be a good idea. But if that's specifically what you're trying
to prevent, I think we should test for it.
Thanks,
Richard
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Add QI vector mode support to by-pieces for memset
2021-07-20 6:38 ` Richard Sandiford
@ 2021-07-20 12:48 ` H.J. Lu
2021-07-20 13:10 ` H.J. Lu
2021-07-20 14:26 ` Richard Sandiford
0 siblings, 2 replies; 14+ messages in thread
From: H.J. Lu @ 2021-07-20 12:48 UTC (permalink / raw)
To: H.J. Lu via Gcc-patches, Jeff Law, H.J. Lu, Richard Sandiford
On Mon, Jul 19, 2021 at 11:38 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> > + {
> >> > + /* First generate subreg of word mode if the previous mode is
> >> > + wider than word mode and word mode is wider than MODE. */
> >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> >> > + prev_mode, 0);
> >> > + prev_mode = word_mode;
> >> > + }
> >> > + if (prev_rtx != nullptr)
> >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
> >>
> >> This should be lowpart_subreg, since 0 isn't the right offset for
> >> big-endian targets. Using lowpart_subreg should also avoid the need
> >> for the word_size “if” above: lowpart_subreg can handle lowpart subword
> >> subregs of multiword values.
> >
> > I tried it. It didn't work since it caused the LRA failure. I replaced
> > simplify_gen_subreg with lowpart_subreg instead.
>
> What specifically went wrong?
With vector broadcast, for
---
extern void *ops;
void
foo (int c)
{
__builtin_memset (ops, c, 18);
}
---
we generate HI from V16QI. With a single lowpart_subreg, I get
(insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
(const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
*)ops.0_1]+16 S2 A8])
(subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
(nil))
instead of
(insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
(const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
*)ops.0_1]+16 S2 A8])
(subreg:HI (reg:DI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
(nil))
IRA and LRA fail to reload:
(insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
(const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
*)ops.0_1]+16 S2 A8])
(subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
(nil))
since ix86_can_change_mode_class has
if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
{
/* Vector registers do not support QI or HImode loads. If we don't
disallow a change to these modes, reload will assume it's ok to
drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
the vec_dupv4hi pattern. */
if (GET_MODE_SIZE (from) < 4)
return false;
}
If we don't use a hard scratch register, (subreg:HI (reg:V16QI)) compiles. But
codegen is worse:
vmovd %edi, %xmm0
vpbroadcastb %xmm0, %xmm0
vmovdqa %xmm0, -24(%rsp)
movq ops(%rip), %rax
movzwl -24(%rsp), %edx
vmovdqu %xmm0, (%rax)
movw %dx, 16(%rax)
vs
vmovd %edi, %xmm15
movq ops(%rip), %rax
vpbroadcastb %xmm15, %xmm15
vmovq %xmm15, %rdx
movw %dx, 16(%rax)
vmovdqu %xmm15, (%rax)
> >> > +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> >> > + bytes from constant string DATA + OFFSET and return it as target
> >> > + constant. If PREV isn't nullptr, it has the RTL info from the
> >> > + previous iteration. */
> >> > +
> >> > +rtx
> >> > +builtin_memset_read_str (void *data, void *prev,
> >> > + HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> >> > + machine_mode mode)
> >> > +{
> >> > + rtx target;
> >> > const char *c = (const char *) data;
> >> > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> >> > + char *p;
> >> > + unsigned int size = GET_MODE_SIZE (mode).to_constant ();
> >> > +
> >> > + /* Don't use the previous value if size is 1. */
> >>
> >> Why not though? The existing code does, and that seems like the right
> >> thing to do when operating on integers.
> >>
> >> I can see the check would be a good thing to do if MODE isn't a vector
> >> mode and the previous mode was. Is that the case you're worried about?
> >> If so, that check could go in gen_memset_value_from_prev instead.
> >
> > We are storing one byte. Doing it directly is faster.
>
> But the first thing being protected here is…
>
> >> > + if (size != 1)
> >> > + {
> >> > + target = gen_memset_value_from_prev (prev, mode);
> >> > + if (target != nullptr)
> >> > + return target;
>
> …this attempt to use the previous value. If the target uses, say,
> SImode for the first piece and QImode for a final byte, using the QImode
> lowpart of the SImode register would avoid having to move the byte value
> into a separate QImode register. Why's that a bad thing to do? AFAICT
> it's what the current code would do, so if we want to change it even for
> integer modes, I think it should be a separate patch with a separate
> justification.
I removed the size == 1 check. I didn't notice any issues.
> Like I say, I can understand that using the QImode lowpart of a vector
> wouldn't be a good idea. But if that's specifically what you're trying
> to prevent, I think we should test for it.
>
> Thanks,
> Richard
I will submit the v3 patch.
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Add QI vector mode support to by-pieces for memset
2021-07-20 12:48 ` H.J. Lu
@ 2021-07-20 13:10 ` H.J. Lu
2021-07-20 14:26 ` Richard Sandiford
1 sibling, 0 replies; 14+ messages in thread
From: H.J. Lu @ 2021-07-20 13:10 UTC (permalink / raw)
To: H.J. Lu via Gcc-patches, Jeff Law, Richard Sandiford
On Tue, Jul 20, 2021 at 5:48 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Jul 19, 2021 at 11:38 PM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
> >
> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> > >> > + {
> > >> > + /* First generate subreg of word mode if the previous mode is
> > >> > + wider than word mode and word mode is wider than MODE. */
> > >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> > >> > + prev_mode, 0);
> > >> > + prev_mode = word_mode;
> > >> > + }
> > >> > + if (prev_rtx != nullptr)
> > >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
> > >>
> > >> This should be lowpart_subreg, since 0 isn't the right offset for
> > >> big-endian targets. Using lowpart_subreg should also avoid the need
> > >> for the word_size “if” above: lowpart_subreg can handle lowpart subword
> > >> subregs of multiword values.
> > >
> > > I tried it. It didn't work since it caused the LRA failure. I replaced
> > > simplify_gen_subreg with lowpart_subreg instead.
> >
> > What specifically went wrong?
>
> With vector broadcast, for
> ---
> extern void *ops;
>
> void
> foo (int c)
> {
> __builtin_memset (ops, c, 18);
> }
> ---
> we generate HI from V16QI. With a single lowpart_subreg, I get
>
> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
> *)ops.0_1]+16 S2 A8])
> (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
> (nil))
>
> instead of
>
> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
> *)ops.0_1]+16 S2 A8])
> (subreg:HI (reg:DI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
> (nil))
>
> IRA and LRA fail to reload:
>
> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
> *)ops.0_1]+16 S2 A8])
> (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
> (nil))
>
> since ix86_can_change_mode_class has
>
> if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
> {
> /* Vector registers do not support QI or HImode loads. If we don't
> disallow a change to these modes, reload will assume it's ok to
> drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
> the vec_dupv4hi pattern. */
> if (GET_MODE_SIZE (from) < 4)
> return false;
> }
Correction. It is ix86_hard_regno_mode_ok which doesn't allow HImode
in XMM registers.
> If we don't use a hard scratch register, (subreg:HI (reg:V16QI)) compiles. But
> codegen is worse:
>
> vmovd %edi, %xmm0
> vpbroadcastb %xmm0, %xmm0
> vmovdqa %xmm0, -24(%rsp)
> movq ops(%rip), %rax
> movzwl -24(%rsp), %edx
> vmovdqu %xmm0, (%rax)
> movw %dx, 16(%rax)
>
> vs
>
> vmovd %edi, %xmm15
> movq ops(%rip), %rax
> vpbroadcastb %xmm15, %xmm15
> vmovq %xmm15, %rdx
> movw %dx, 16(%rax)
> vmovdqu %xmm15, (%rax)
>
> > >> > +/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> > >> > + bytes from constant string DATA + OFFSET and return it as target
> > >> > + constant. If PREV isn't nullptr, it has the RTL info from the
> > >> > + previous iteration. */
> > >> > +
> > >> > +rtx
> > >> > +builtin_memset_read_str (void *data, void *prev,
> > >> > + HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > >> > + machine_mode mode)
> > >> > +{
> > >> > + rtx target;
> > >> > const char *c = (const char *) data;
> > >> > - char *p = XALLOCAVEC (char, GET_MODE_SIZE (mode));
> > >> > + char *p;
> > >> > + unsigned int size = GET_MODE_SIZE (mode).to_constant ();
> > >> > +
> > >> > + /* Don't use the previous value if size is 1. */
> > >>
> > >> Why not though? The existing code does, and that seems like the right
> > >> thing to do when operating on integers.
> > >>
> > >> I can see the check would be a good thing to do if MODE isn't a vector
> > >> mode and the previous mode was. Is that the case you're worried about?
> > >> If so, that check could go in gen_memset_value_from_prev instead.
> > >
> > > We are storing one byte. Doing it directly is faster.
> >
> > But the first thing being protected here is…
> >
> > >> > + if (size != 1)
> > >> > + {
> > >> > + target = gen_memset_value_from_prev (prev, mode);
> > >> > + if (target != nullptr)
> > >> > + return target;
> >
> > …this attempt to use the previous value. If the target uses, say,
> > SImode for the first piece and QImode for a final byte, using the QImode
> > lowpart of the SImode register would avoid having to move the byte value
> > into a separate QImode register. Why's that a bad thing to do? AFAICT
> > it's what the current code would do, so if we want to change it even for
> > integer modes, I think it should be a separate patch with a separate
> > justification.
>
> I removed the size == 1 check. I didn't notice any issues.
>
> > Like I say, I can understand that using the QImode lowpart of a vector
> > wouldn't be a good idea. But if that's specifically what you're trying
> > to prevent, I think we should test for it.
> >
> > Thanks,
> > Richard
>
> I will submit the v3 patch.
>
> Thanks.
>
> --
> H.J.
--
H.J.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Add QI vector mode support to by-pieces for memset
2021-07-20 12:48 ` H.J. Lu
2021-07-20 13:10 ` H.J. Lu
@ 2021-07-20 14:26 ` Richard Sandiford
2021-07-20 15:12 ` Richard Sandiford
1 sibling, 1 reply; 14+ messages in thread
From: Richard Sandiford @ 2021-07-20 14:26 UTC (permalink / raw)
To: H.J. Lu via Gcc-patches; +Cc: Jeff Law, H.J. Lu
"H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> On Mon, Jul 19, 2021 at 11:38 PM Richard Sandiford
> <richard.sandiford@arm.com> wrote:
>>
>> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> >> > + {
>> >> > + /* First generate subreg of word mode if the previous mode is
>> >> > + wider than word mode and word mode is wider than MODE. */
>> >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
>> >> > + prev_mode, 0);
>> >> > + prev_mode = word_mode;
>> >> > + }
>> >> > + if (prev_rtx != nullptr)
>> >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
>> >>
>> >> This should be lowpart_subreg, since 0 isn't the right offset for
>> >> big-endian targets. Using lowpart_subreg should also avoid the need
>> >> for the word_size “if” above: lowpart_subreg can handle lowpart subword
>> >> subregs of multiword values.
>> >
>> > I tried it. It didn't work since it caused the LRA failure. I replaced
>> > simplify_gen_subreg with lowpart_subreg instead.
>>
>> What specifically went wrong?
>
> With vector broadcast, for
> ---
> extern void *ops;
>
> void
> foo (int c)
> {
> __builtin_memset (ops, c, 18);
> }
> ---
> we generate HI from V16QI. With a single lowpart_subreg, I get
>
> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
> *)ops.0_1]+16 S2 A8])
> (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
> (nil))
>
> instead of
>
> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
> *)ops.0_1]+16 S2 A8])
> (subreg:HI (reg:DI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
> (nil))
>
> IRA and LRA fail to reload:
>
> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
> *)ops.0_1]+16 S2 A8])
> (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
> (nil))
>
> since ix86_can_change_mode_class has
>
> if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
> {
> /* Vector registers do not support QI or HImode loads. If we don't
> disallow a change to these modes, reload will assume it's ok to
> drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
> the vec_dupv4hi pattern. */
> if (GET_MODE_SIZE (from) < 4)
> return false;
> }
Ah! OK. In that case, maybe we should have something like:
if (REG_P (prev_rtx)
&& HARD_REGISTER_P (prev_rtx)
&& REG_CAN_CHANGE_MODE_P (REGNO (prev_rtx), prev->mode, mode))
prev_rtx = copy_to_reg (prev_rtx);
and then just have the single lowpart_subreg after that.
Thanks,
Richard
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Add QI vector mode support to by-pieces for memset
2021-07-20 14:26 ` Richard Sandiford
@ 2021-07-20 15:12 ` Richard Sandiford
2021-07-20 18:52 ` H.J. Lu
0 siblings, 1 reply; 14+ messages in thread
From: Richard Sandiford @ 2021-07-20 15:12 UTC (permalink / raw)
To: H.J. Lu via Gcc-patches; +Cc: Jeff Law, H.J. Lu
Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>> On Mon, Jul 19, 2021 at 11:38 PM Richard Sandiford
>> <richard.sandiford@arm.com> wrote:
>>>
>>> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
>>> >> > + {
>>> >> > + /* First generate subreg of word mode if the previous mode is
>>> >> > + wider than word mode and word mode is wider than MODE. */
>>> >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
>>> >> > + prev_mode, 0);
>>> >> > + prev_mode = word_mode;
>>> >> > + }
>>> >> > + if (prev_rtx != nullptr)
>>> >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
>>> >>
>>> >> This should be lowpart_subreg, since 0 isn't the right offset for
>>> >> big-endian targets. Using lowpart_subreg should also avoid the need
>>> >> for the word_size “if” above: lowpart_subreg can handle lowpart subword
>>> >> subregs of multiword values.
>>> >
>>> > I tried it. It didn't work since it caused the LRA failure. I replaced
>>> > simplify_gen_subreg with lowpart_subreg instead.
>>>
>>> What specifically went wrong?
>>
>> With vector broadcast, for
>> ---
>> extern void *ops;
>>
>> void
>> foo (int c)
>> {
>> __builtin_memset (ops, c, 18);
>> }
>> ---
>> we generate HI from V16QI. With a single lowpart_subreg, I get
>>
>> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
>> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
>> *)ops.0_1]+16 S2 A8])
>> (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
>> (nil))
>>
>> instead of
>>
>> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
>> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
>> *)ops.0_1]+16 S2 A8])
>> (subreg:HI (reg:DI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
>> (nil))
>>
>> IRA and LRA fail to reload:
>>
>> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
>> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
>> *)ops.0_1]+16 S2 A8])
>> (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
>> (nil))
>>
>> since ix86_can_change_mode_class has
>>
>> if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
>> {
>> /* Vector registers do not support QI or HImode loads. If we don't
>> disallow a change to these modes, reload will assume it's ok to
>> drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
>> the vec_dupv4hi pattern. */
>> if (GET_MODE_SIZE (from) < 4)
>> return false;
>> }
>
> Ah! OK. In that case, maybe we should have something like:
>
> if (REG_P (prev_rtx)
> && HARD_REGISTER_P (prev_rtx)
> && REG_CAN_CHANGE_MODE_P (REGNO (prev_rtx), prev->mode, mode))
Sorry, make that last line:
&& lowpart_subreg_regno (REGNO (prev_rtx), prev->mode, mode) < 0
where lowpart_subreg_regno is a new wrapper around simplify_subreg_regno
that uses subreg_lowpart_offset (mode, prev->mode) as the offset.
Thanks,
Richard
> prev_rtx = copy_to_reg (prev_rtx);
>
> and then just have the single lowpart_subreg after that.
>
> Thanks,
> Richard
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH] Add QI vector mode support to by-pieces for memset
2021-07-20 15:12 ` Richard Sandiford
@ 2021-07-20 18:52 ` H.J. Lu
0 siblings, 0 replies; 14+ messages in thread
From: H.J. Lu @ 2021-07-20 18:52 UTC (permalink / raw)
To: H.J. Lu via Gcc-patches, Jeff Law, Richard Sandiford
On Tue, Jul 20, 2021 at 8:12 AM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Sandiford via Gcc-patches <gcc-patches@gcc.gnu.org> writes:
> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> On Mon, Jul 19, 2021 at 11:38 PM Richard Sandiford
> >> <richard.sandiford@arm.com> wrote:
> >>>
> >>> "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >>> >> > + {
> >>> >> > + /* First generate subreg of word mode if the previous mode is
> >>> >> > + wider than word mode and word mode is wider than MODE. */
> >>> >> > + prev_rtx = simplify_gen_subreg (word_mode, prev_rtx,
> >>> >> > + prev_mode, 0);
> >>> >> > + prev_mode = word_mode;
> >>> >> > + }
> >>> >> > + if (prev_rtx != nullptr)
> >>> >> > + target = simplify_gen_subreg (mode, prev_rtx, prev_mode, 0);
> >>> >>
> >>> >> This should be lowpart_subreg, since 0 isn't the right offset for
> >>> >> big-endian targets. Using lowpart_subreg should also avoid the need
> >>> >> for the word_size “if” above: lowpart_subreg can handle lowpart subword
> >>> >> subregs of multiword values.
> >>> >
> >>> > I tried it. It didn't work since it caused the LRA failure. I replaced
> >>> > simplify_gen_subreg with lowpart_subreg instead.
> >>>
> >>> What specifically went wrong?
> >>
> >> With vector broadcast, for
> >> ---
> >> extern void *ops;
> >>
> >> void
> >> foo (int c)
> >> {
> >> __builtin_memset (ops, c, 18);
> >> }
> >> ---
> >> we generate HI from V16QI. With a single lowpart_subreg, I get
> >>
> >> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
> >> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
> >> *)ops.0_1]+16 S2 A8])
> >> (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
> >> (nil))
> >>
> >> instead of
> >>
> >> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
> >> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
> >> *)ops.0_1]+16 S2 A8])
> >> (subreg:HI (reg:DI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
> >> (nil))
> >>
> >> IRA and LRA fail to reload:
> >>
> >> (insn 10 9 0 2 (set (mem:HI (plus:DI (reg/f:DI 84)
> >> (const_int 16 [0x10])) [0 MEM <char[1:18]> [(void
> >> *)ops.0_1]+16 S2 A8])
> >> (subreg:HI (reg:V16QI 51 xmm15) 0)) "s2a.i":6:3 76 {*movhi_internal}
> >> (nil))
> >>
> >> since ix86_can_change_mode_class has
> >>
> >> if (MAYBE_SSE_CLASS_P (regclass) || MAYBE_MMX_CLASS_P (regclass))
> >> {
> >> /* Vector registers do not support QI or HImode loads. If we don't
> >> disallow a change to these modes, reload will assume it's ok to
> >> drop the subreg from (subreg:SI (reg:HI 100) 0). This affects
> >> the vec_dupv4hi pattern. */
> >> if (GET_MODE_SIZE (from) < 4)
> >> return false;
> >> }
> >
> > Ah! OK. In that case, maybe we should have something like:
> >
> > if (REG_P (prev_rtx)
> > && HARD_REGISTER_P (prev_rtx)
> > && REG_CAN_CHANGE_MODE_P (REGNO (prev_rtx), prev->mode, mode))
>
> Sorry, make that last line:
>
> && lowpart_subreg_regno (REGNO (prev_rtx), prev->mode, mode) < 0
>
> where lowpart_subreg_regno is a new wrapper around simplify_subreg_regno
> that uses subreg_lowpart_offset (mode, prev->mode) as the offset.
Fixed. I submitted the v3 patch:
https://gcc.gnu.org/pipermail/gcc-patches/2021-July/575670.html
Thanks.
> Thanks,
> Richard
>
> > prev_rtx = copy_to_reg (prev_rtx);
> >
> > and then just have the single lowpart_subreg after that.
> >
> > Thanks,
> > Richard
--
H.J.
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2021-07-20 18:52 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-07-13 21:49 [PATCH] Rewrite memset expanders with vec_duplicate H.J. Lu
2021-07-16 11:38 ` Richard Sandiford
2021-07-16 12:57 ` H.J. Lu
2021-07-16 13:23 ` Richard Sandiford
2021-07-16 14:15 ` H.J. Lu
2021-07-16 23:57 ` [PATCH] Add QI vector mode support to by-pieces for memset H.J. Lu
2021-07-19 14:40 ` Richard Sandiford
2021-07-20 2:43 ` H.J. Lu
2021-07-20 6:38 ` Richard Sandiford
2021-07-20 12:48 ` H.J. Lu
2021-07-20 13:10 ` H.J. Lu
2021-07-20 14:26 ` Richard Sandiford
2021-07-20 15:12 ` Richard Sandiford
2021-07-20 18:52 ` H.J. Lu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).