public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: Noah Goldstein <goldstein.w.n@gmail.com>
To: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Cc: libc-alpha@sourceware.org, "H . J . Lu" <hjl.tools@gmail.com>,
	 Sajan Karumanchi <sajan.karumanchi@gmail.com>,
	bmerry@sarao.ac.za, pmallapp@amd.com
Subject: Re: [PATCH v2 1/3] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
Date: Tue, 6 Feb 2024 18:36:47 +0000	[thread overview]
Message-ID: <CAFUsyfKp8Q0x6Cxj3Z1JXmg_jkWOKth31EJpC8SvSsQkxu-nsQ@mail.gmail.com> (raw)
In-Reply-To: <20240206174322.2317679-2-adhemerval.zanella@linaro.org>

On Tue, Feb 6, 2024 at 5:43 PM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
> The REP MOVSB usage on memcpy/memmove does not show much performance
> improvement on Zen3/Zen4 cores compared to the vectorized loops.  Also,
> as from BZ 30994, if the source is aligned and the destination is not
> the performance can be 20x slower.
>
> The performance difference is noticeable with small buffer sizes, closer
> to the lower bounds limits when memcpy/memmove starts to use ERMS.  The
> performance of REP MOVSB is similar to vectorized instruction on the
> size limit (the L2 cache).  Also, there is no drawback to multiple cores
> sharing the cache.
>
> A new tunable, glibc.cpu.x86_rep_movsb_stop_threshold, allows to set up
> the higher bound size to use 'rep movsb'.
>
> Checked on x86_64-linux-gnu on Zen3.
> ---
>  manual/tunables.texi         |  9 +++++++
>  sysdeps/x86/dl-cacheinfo.h   | 50 +++++++++++++++++++++---------------
>  sysdeps/x86/dl-tunables.list | 10 ++++++++
>  3 files changed, 48 insertions(+), 21 deletions(-)
>
> diff --git a/manual/tunables.texi b/manual/tunables.texi
> index be97190d67..ee5d90b91b 100644
> --- a/manual/tunables.texi
> +++ b/manual/tunables.texi
> @@ -569,6 +569,15 @@ greater than zero, and currently defaults to 2048 bytes.
>  This tunable is specific to i386 and x86-64.
>  @end deftp
>
> +@deftp Tunable glibc.cpu.x86_rep_movsb_stop_threshold
> +The @code{glibc.cpu.x86_rep_movsb_threshold} tunable allows the user to
> +set the threshold in bytes to stop using "rep movsb".  The value must be
> +greater than zero, and currently, the default depends on the CPU and the
> +cache size.
> +
> +This tunable is specific to i386 and x86-64.
> +@end deftp
> +
>  @deftp Tunable glibc.cpu.x86_rep_stosb_threshold
>  The @code{glibc.cpu.x86_rep_stosb_threshold} tunable allows the user to
>  set threshold in bytes to start using "rep stosb".  The value must be
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index d5101615e3..74b804c5e6 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    long int data = -1;
>    long int shared = -1;
>    long int shared_per_thread = -1;
> -  long int core = -1;
>    unsigned int threads = 0;
>    unsigned long int level1_icache_size = -1;
>    unsigned long int level1_icache_linesize = -1;
> @@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>    if (cpu_features->basic.kind == arch_kind_intel)
>      {
>        data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
> -      core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
>        shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
>        shared_per_thread = shared;
>
> @@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>         = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
>        level1_dcache_linesize
>         = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
> -      level2_cache_size = core;
> +      level2_cache_size
> +       = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
>        level2_cache_assoc
>         = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
>        level2_cache_linesize
> @@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>        level4_cache_size
>         = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
>
> -      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
> +      get_common_cache_info (&shared, &shared_per_thread, &threads,
> +                            level2_cache_size);
>      }
>    else if (cpu_features->basic.kind == arch_kind_zhaoxin)
>      {
>        data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
> -      core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
>        shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
>        shared_per_thread = shared;
>
> @@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>        level1_dcache_size = data;
>        level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
>        level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
> -      level2_cache_size = core;
> +      level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
>        level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
>        level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
>        level3_cache_size = shared;
>        level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
>        level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
>
> -      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
> +      get_common_cache_info (&shared, &shared_per_thread, &threads,
> +                            level2_cache_size);
>      }
>    else if (cpu_features->basic.kind == arch_kind_amd)
>      {
>        data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
> -      core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
>        shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
>
>        level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
> @@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>        level1_dcache_size = data;
>        level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
>        level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
> -      level2_cache_size = core;
> +      level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);;
>        level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
>        level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
>        level3_cache_size = shared;
> @@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>        if (shared <= 0)
>          {
>             /* No shared L3 cache.  All we have is the L2 cache.  */
> -           shared = core;
> +           shared = level2_cache_size;
>          }
>        else if (cpu_features->basic.family < 0x17)
>          {
>             /* Account for exclusive L2 and L3 caches.  */
> -           shared += core;
> +           shared += level2_cache_size;
>          }
>
>        shared_per_thread = shared;
> @@ -1028,16 +1027,25 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
>                            SIZE_MAX);
>
>    unsigned long int rep_movsb_stop_threshold;
> -  /* ERMS feature is implemented from AMD Zen3 architecture and it is
> -     performing poorly for data above L2 cache size. Henceforth, adding
> -     an upper bound threshold parameter to limit the usage of Enhanced
> -     REP MOVSB operations and setting its value to L2 cache size.  */
> -  if (cpu_features->basic.kind == arch_kind_amd)
> -    rep_movsb_stop_threshold = core;
> -  /* Setting the upper bound of ERMS to the computed value of
> -     non-temporal threshold for architectures other than AMD.  */
> -  else
> -    rep_movsb_stop_threshold = non_temporal_threshold;
> +  /* If the tunable is set and with a valid value (larger than the minimal
> +     threshold to use ERMS) use it instead of default values.  */
> +  rep_movsb_stop_threshold = TUNABLE_GET (x86_rep_movsb_stop_threshold,
> +                                         long int, NULL);
> +  if (!TUNABLE_IS_INITIALIZED (x86_rep_movsb_stop_threshold)
> +      || rep_movsb_stop_threshold <= rep_movsb_threshold)
> +    {
> +      /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
> +        cases slower than the vectorized path (and for some alignments,
> +        it is really slow, check BZ #30994).  */
> +      if (cpu_features->basic.kind == arch_kind_amd)
> +       rep_movsb_stop_threshold = 0;
note that if `size >= rep_movsb_threshold && size >= rep_movsb_stop_threshold`
we will use NT stores, not temporal stores.

Id think you would want this to be setting the
`rep_movsb_threshold` -> `non_temporal_threshold`
which would essentially disable `rep movsb` but continue to
use the other tunables for temporal/non-temporal decisions.
> +      else
> +      /* Setting the upper bound of ERMS to the computed value of
> +        non-temporal threshold for architectures other than AMD.  */
> +       rep_movsb_stop_threshold = non_temporal_threshold;
> +    }
> +  TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
> +                          SIZE_MAX);
>
>    cpu_features->data_cache_size = data;
>    cpu_features->shared_cache_size = shared;
> diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
> index 7d82da0dec..80cf5563ab 100644
> --- a/sysdeps/x86/dl-tunables.list
> +++ b/sysdeps/x86/dl-tunables.list
> @@ -49,6 +49,16 @@ glibc {
>        # if the tunable value is set by user or not [BZ #27069].
>        minval: 1
>      }
> +    x86_rep_movsb_stop_threshold {
> +      # For AMD CPUs that support ERMS (Zen3+), REP MOVSB is not faster
> +      # than the vectorized path (and for some destination alignment it
> +      # is really slow, check BZ #30994).  On Intel CPUs, the size limit
> +      # to use ERMS is [1/8, 1/2] of the size of the chip's cache, check
> +      # the dl-cacheinfo.h).
> +      # This tunable allows the caller to set the limit where to use REP
> +      # MOVB on memcpy/memmove.
> +      type: SIZE_T
> +    }
>      x86_rep_stosb_threshold {
>        type: SIZE_T
>        # Since there is overhead to set up REP STOSB operation, REP STOSB
> --
> 2.34.1
>

  reply	other threads:[~2024-02-06 18:36 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-02-06 17:43 [PATCH v2 0/3] x86: Improve ERMS usage on Zen3+ Adhemerval Zanella
2024-02-06 17:43 ` [PATCH v2 1/3] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994) Adhemerval Zanella
2024-02-06 18:36   ` Noah Goldstein [this message]
2024-02-07 12:10     ` Adhemerval Zanella Netto
2024-02-07 17:39       ` Noah Goldstein
2024-02-07 18:06         ` Adhemerval Zanella Netto
2024-02-07 18:10           ` Adhemerval Zanella Netto
2024-02-07 22:07             ` Noah Goldstein
2024-02-09 10:59               ` Karumanchi, Sajan
2024-02-06 17:43 ` [PATCH v2 2/3] x86: Do not prefer ERMS for memset on Zen3+ Adhemerval Zanella
2024-02-06 19:01   ` Noah Goldstein
2024-02-07 12:14     ` Adhemerval Zanella Netto
2024-02-06 17:43 ` [PATCH v2 3/3] x86: Expand the comment on when REP STOSB is used on memset Adhemerval Zanella

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CAFUsyfKp8Q0x6Cxj3Z1JXmg_jkWOKth31EJpC8SvSsQkxu-nsQ@mail.gmail.com \
    --to=goldstein.w.n@gmail.com \
    --cc=adhemerval.zanella@linaro.org \
    --cc=bmerry@sarao.ac.za \
    --cc=hjl.tools@gmail.com \
    --cc=libc-alpha@sourceware.org \
    --cc=pmallapp@amd.com \
    --cc=sajan.karumanchi@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).