From: "H.J. Lu" <hjl.tools@gmail.com>
To: "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org>,
Richard Biener <richard.guenther@gmail.com>,
Jeff Law <jlaw@tachyum.com>,
Richard Sandiford <richard.sandiford@arm.com>
Subject: Re: [PATCH v3] Add QI vector mode support to by-pieces for memset
Date: Wed, 21 Jul 2021 13:02:59 -0700 [thread overview]
Message-ID: <CAMe9rOqhaPfMO=2eVhz5f_QH+HJ5LiVB_gznsOStXeSXJgBsRw@mail.gmail.com> (raw)
In-Reply-To: <mptpmvbtpvm.fsf@arm.com>
On Wed, Jul 21, 2021 at 12:42 PM Richard Sandiford
<richard.sandiford@arm.com> wrote:
>
> Richard Sandiford <richard.sandiford@arm.com> writes:
> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> On Wed, Jul 21, 2021 at 7:50 AM Richard Sandiford
> >> <richard.sandiford@arm.com> wrote:
> >>>
> >>> "H.J. Lu" <hjl.tools@gmail.com> writes:
> >>> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> >>> > index 39ab139b7e1..1972301ce3c 100644
> >>> > --- a/gcc/builtins.c
> >>> > +++ b/gcc/builtins.c
> >>> > @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, machine_mode target_mode)
> >>> >
> >>> > static rtx
> >>> > builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> >>> > - scalar_int_mode mode)
> >>> > + fixed_size_mode mode)
> >>> > {
> >>> > /* The REPresentation pointed to by DATA need not be a nul-terminated
> >>> > string but the caller guarantees it's large enough for MODE. */
> >>> > const char *rep = (const char *) data;
> >>> >
> >>> > - return c_readstr (rep + offset, mode, /*nul_terminated=*/false);
> >>> > + /* NB: Vector mode in the by-pieces infrastructure is only used by
> >>> > + the memset expander. */
> >>>
> >>> Sorry to nitpick, but I guess this might get out out-of-date. Maybe:
> >>>
> >>> /* The by-pieces infrastructure does not try to pick a vector mode
> >>> for memcpy expansion. */
> >>
> >> Fixed.
> >>
> >>> > + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),
> >>> > + /*nul_terminated=*/false);
> >>> > }
> >>> >
> >>> > /* LEN specify length of the block of memcpy/memset operation.
> >>> > @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx)
> >>> >
> >>> > rtx
> >>> > builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> >>> > - scalar_int_mode mode)
> >>> > + fixed_size_mode mode)
> >>> > {
> >>> > const char *str = (const char *) data;
> >>> >
> >>> > if ((unsigned HOST_WIDE_INT) offset > strlen (str))
> >>> > return const0_rtx;
> >>> >
> >>> > - return c_readstr (str + offset, mode);
> >>> > + /* NB: Vector mode in the by-pieces infrastructure is only used by
> >>> > + the memset expander. */
> >>>
> >>> Similarly here for strncpy expansion.
> >>
> >> Fixed.
> >>
> >>> > + return c_readstr (str + offset, as_a <scalar_int_mode> (mode));
> >>> > }
> >>> >
> >>> > /* Helper to check the sizes of sequences and the destination of calls
> >>> > @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target)
> >>> > return NULL_RTX;
> >>> > }
> >>> >
> >>> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE)
> >>> > - bytes from constant string DATA + OFFSET and return it as target
> >>> > - constant. If PREV isn't nullptr, it has the RTL info from the
> >>> > +/* Return the RTL of a register in MODE generated from PREV in the
> >>> > previous iteration. */
> >>> >
> >>> > -rtx
> >>> > -builtin_memset_read_str (void *data, void *prevp,
> >>> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> >>> > - scalar_int_mode mode)
> >>> > +static rtx
> >>> > +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode)
> >>> > {
> >>> > - by_pieces_prev *prev = (by_pieces_prev *) prevp;
> >>> > + rtx target = nullptr;
> >>> > if (prev != nullptr && prev->data != nullptr)
> >>> > {
> >>> > /* Use the previous data in the same mode. */
> >>> > if (prev->mode == mode)
> >>> > return prev->data;
> >>> > +
> >>> > + fixed_size_mode prev_mode = prev->mode;
> >>> > +
> >>> > + /* Don't use the previous data to write QImode if it is in a
> >>> > + vector mode. */
> >>> > + if (VECTOR_MODE_P (prev_mode) && mode == QImode)
> >>> > + return target;
> >>> > +
> >>> > + rtx prev_rtx = prev->data;
> >>> > +
> >>> > + if (REG_P (prev_rtx)
> >>> > + && HARD_REGISTER_P (prev_rtx)
> >>> > + && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
> >>> > + {
> >>> > + /* If we can't put a hard register in MODE, first generate a
> >>> > + subreg of word mode if the previous mode is wider than word
> >>> > + mode and word mode is wider than MODE. */
> >>> > + if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode)
> >>> > + && UNITS_PER_WORD > GET_MODE_SIZE (mode))
> >>> > + {
> >>> > + prev_rtx = lowpart_subreg (word_mode, prev_rtx,
> >>> > + prev_mode);
> >>> > + if (prev_rtx != nullptr)
> >>> > + prev_mode = word_mode;
> >>> > + }
> >>> > + else
> >>> > + prev_rtx = nullptr;
> >>>
> >>> I don't understand this. Why not just do the:
> >>>
> >>> if (REG_P (prev_rtx)
> >>> && HARD_REGISTER_P (prev_rtx)
> >>> && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
> >>> prev_rtx = copy_to_reg (prev_rtx);
> >>>
> >>> that I suggested in the previous review?
> >>
> >> But for
> >> ---
> >> extern void *ops;
> >>
> >> void
> >> foo (int c)
> >> {
> >> __builtin_memset (ops, c, 18);
> >> }
> >> ---
> >> I got
> >>
> >> vpbroadcastb %edi, %xmm31
> >> vmovdqa64 %xmm31, -24(%rsp)
> >> movq ops(%rip), %rax
> >> movzwl -24(%rsp), %edx
> >> vmovdqu8 %xmm31, (%rax)
> >> movw %dx, 16(%rax)
> >> ret
> >>
> >> I want to avoid store and load. I am testing
> >>
> >> if (REG_P (prev_rtx)
> >> && HARD_REGISTER_P (prev_rtx)
> >> && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
> >> {
> >> /* Find the smallest subreg mode in the same mode class which
> >> is not narrower than MODE and narrower than PREV_MODE. */
> >> machine_mode m;
> >> fixed_size_mode candidate;
> >> FOR_EACH_MODE_IN_CLASS (m, GET_MODE_CLASS (mode))
> >> if (is_a<fixed_size_mode> (m, &candidate))
> >> {
> >> if (GET_MODE_SIZE (candidate)
> >> >= GET_MODE_SIZE (prev_mode))
> >> break;
> >> if (GET_MODE_SIZE (candidate) >= GET_MODE_SIZE (mode)
> >> && lowpart_subreg_regno (REGNO (prev_rtx),
> >> prev_mode, candidate) >= 0)
> >> {
> >> target = lowpart_subreg (candidate, prev_rtx,
> >> prev_mode);
> >> prev_rtx = target;
> >> prev_mode = candidate;
> >> break;
> >> }
> >> }
> >
> > That doesn't seem better though. There are still two subregs involved.
>
> Actually, I take that back. I guess it does make sense, and is
> definitely better than hard-coding word_mode. How about a comment
> along the lines of the following, instead of the one above:
>
> /* This case occurs when PREV_MODE is a vector and when
> MODE is too small to store using vector operations.
> After register allocation, the code will need to move the
> lowpart of the vector register into a non-vector register.
>
> Also, the target has chosen to use a hard register
> instead of going with the default choice of using a
> pseudo register. We should respect that choice and try to
> avoid creating a pseudo register with the same mode as the
> current hard register.
>
> In principle, we could just use a lowpart MODE subreg of
> the vector register. However, the vector register mode might
> be too wide for non-vector registers, and we already know
> that the non-vector mode is too small for vector registers.
> It's therefore likely that we'd need to spill to memory in
> the vector mode and reload the non-vector value from there.
>
> Try to avoid that by reducing the vector register to the
> smallest size that it can hold. This should increase the
> chances that non-vector registers can hold both the inner
> and outer modes of the subreg that we generate later. */
Fixed.
> Thanks,
> Richard
I sent the v4 patch.
Thanks.
--
H.J.
prev parent reply other threads:[~2021-07-21 20:03 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2021-07-20 18:50 H.J. Lu
2021-07-21 14:50 ` Richard Sandiford
2021-07-21 19:16 ` H.J. Lu
2021-07-21 19:20 ` Richard Sandiford
2021-07-21 19:31 ` H.J. Lu
2021-07-21 19:42 ` Richard Sandiford
2021-07-21 20:02 ` H.J. Lu [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to='CAMe9rOqhaPfMO=2eVhz5f_QH+HJ5LiVB_gznsOStXeSXJgBsRw@mail.gmail.com' \
--to=hjl.tools@gmail.com \
--cc=gcc-patches@gcc.gnu.org \
--cc=jlaw@tachyum.com \
--cc=richard.guenther@gmail.com \
--cc=richard.sandiford@arm.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).