From: "H.J. Lu" <hjl.tools@gmail.com>
To: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Cc: libc-alpha@sourceware.org,
Noah Goldstein <goldstein.w.n@gmail.com>,
Sajan Karumanchi <sajan.karumanchi@gmail.com>,
bmerry@sarao.ac.za, pmallapp@amd.com
Subject: Re: [PATCH v3 1/3] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994)
Date: Mon, 12 Feb 2024 07:56:09 -0800
Message-ID: <CAMe9rOrVy97TibQY3dCq5puv5DW6akna4BNfF7wPax2O0+w8ew@mail.gmail.com>
In-Reply-To: <20240208130840.533348-2-adhemerval.zanella@linaro.org>
On Thu, Feb 8, 2024 at 5:08 AM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
> The REP MOVSB usage on memcpy/memmove does not show much performance
> improvement on Zen3/Zen4 cores compared to the vectorized loops. Also,
> as reported in BZ 30994, if the source is aligned and the destination
> is not, the performance can be 20x slower.
>
> The performance difference is most noticeable with small buffer sizes,
> close to the lower bound at which memcpy/memmove starts to use ERMS.
> The performance of REP MOVSB is similar to that of the vectorized loops
> near the upper size limit (the L2 cache size). Also, there is no
> drawback when multiple cores share the cache.
>
> Checked on x86_64-linux-gnu on Zen3.
> ---
> sysdeps/x86/dl-cacheinfo.h | 38 ++++++++++++++++++--------------------
> 1 file changed, 18 insertions(+), 20 deletions(-)
>
> diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
> index d5101615e3..f34d12846c 100644
> --- a/sysdeps/x86/dl-cacheinfo.h
> +++ b/sysdeps/x86/dl-cacheinfo.h
> @@ -791,7 +791,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> long int data = -1;
> long int shared = -1;
> long int shared_per_thread = -1;
> - long int core = -1;
> unsigned int threads = 0;
> unsigned long int level1_icache_size = -1;
> unsigned long int level1_icache_linesize = -1;
> @@ -809,7 +808,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> if (cpu_features->basic.kind == arch_kind_intel)
> {
> data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
> - core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
> shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
> shared_per_thread = shared;
>
> @@ -822,7 +820,8 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
> level1_dcache_linesize
> = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
> - level2_cache_size = core;
> + level2_cache_size
> + = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
> level2_cache_assoc
> = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
> level2_cache_linesize
> @@ -835,12 +834,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> level4_cache_size
> = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
>
> - get_common_cache_info (&shared, &shared_per_thread, &threads, core);
> + get_common_cache_info (&shared, &shared_per_thread, &threads,
> + level2_cache_size);
> }
> else if (cpu_features->basic.kind == arch_kind_zhaoxin)
> {
> data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
> - core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
> shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
> shared_per_thread = shared;
>
> @@ -849,19 +848,19 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> level1_dcache_size = data;
> level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
> level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
> - level2_cache_size = core;
> + level2_cache_size = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
> level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
> level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
> level3_cache_size = shared;
> level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
> level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
>
> - get_common_cache_info (&shared, &shared_per_thread, &threads, core);
> + get_common_cache_info (&shared, &shared_per_thread, &threads,
> + level2_cache_size);
> }
> else if (cpu_features->basic.kind == arch_kind_amd)
> {
> data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
> - core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
> shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
>
> level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
> @@ -869,7 +868,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> level1_dcache_size = data;
> level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
> level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
> - level2_cache_size = core;
> +      level2_cache_size = handle_amd (_SC_LEVEL2_CACHE_SIZE);
> level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
> level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
> level3_cache_size = shared;
> @@ -880,12 +879,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> if (shared <= 0)
> {
> /* No shared L3 cache. All we have is the L2 cache. */
> - shared = core;
> + shared = level2_cache_size;
> }
> else if (cpu_features->basic.family < 0x17)
> {
> /* Account for exclusive L2 and L3 caches. */
> - shared += core;
> + shared += level2_cache_size;
> }
>
> shared_per_thread = shared;
> @@ -987,6 +986,12 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
> rep_movsb_threshold = 2112;
>
> + /* For AMD CPUs that support ERMS (Zen3+), REP MOVSB is in a lot of
> + cases slower than the vectorized path (and for some alignments,
> + it is really slow, check BZ #30994). */
> + if (cpu_features->basic.kind == arch_kind_amd)
> + rep_movsb_threshold = non_temporal_threshold;
> +
> /* The default threshold to use Enhanced REP STOSB. */
> unsigned long int rep_stosb_threshold = 2048;
>
> @@ -1028,16 +1033,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
> SIZE_MAX);
>
> unsigned long int rep_movsb_stop_threshold;
> - /* ERMS feature is implemented from AMD Zen3 architecture and it is
> - performing poorly for data above L2 cache size. Henceforth, adding
> - an upper bound threshold parameter to limit the usage of Enhanced
> - REP MOVSB operations and setting its value to L2 cache size. */
> - if (cpu_features->basic.kind == arch_kind_amd)
> - rep_movsb_stop_threshold = core;
> /* Setting the upper bound of ERMS to the computed value of
> - non-temporal threshold for architectures other than AMD. */
> - else
> - rep_movsb_stop_threshold = non_temporal_threshold;
> + non-temporal threshold for all architectures. */
> + rep_movsb_stop_threshold = non_temporal_threshold;
>
> cpu_features->data_cache_size = data;
> cpu_features->shared_cache_size = shared;
> --
> 2.34.1
>
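
For anyone following along, here is a rough C sketch of how the size-based
dispatch behaves after this patch. It is only an illustrative model, not
the actual implementation (the real selection is done in the assembly in
memmove-vec-unaligned-erms.S and is subject to feature and overlap checks),
but the threshold names match the variables computed in dl-cacheinfo.h:

  /* Illustrative model of the memmove size dispatch; not the real code.  */
  enum memmove_path { VECTORIZED_LOOP, REP_MOVSB_PATH, NON_TEMPORAL };

  static enum memmove_path
  select_memmove_path (unsigned long int size,
                       unsigned long int rep_movsb_threshold,
                       unsigned long int rep_movsb_stop_threshold,
                       unsigned long int non_temporal_threshold)
  {
    if (size < rep_movsb_threshold)
      return VECTORIZED_LOOP;
    /* With this patch, AMD gets rep_movsb_threshold
       == rep_movsb_stop_threshold == non_temporal_threshold, so this
       window is empty and REP MOVSB is effectively never chosen there.  */
    if (size < rep_movsb_stop_threshold)
      return REP_MOVSB_PATH;
    /* Between the REP MOVSB upper bound and the non-temporal threshold
       the vectorized loop is used again; above it, non-temporal stores
       (subject to the usual checks in the real implementation).  */
    if (size < non_temporal_threshold)
      return VECTORIZED_LOOP;
    return NON_TEMPORAL;
  }

Since the tunable handling in dl_init_cacheinfo runs after this default is
applied, I expect glibc.cpu.x86_rep_movsb_threshold can still be used to
lower the threshold and re-enable REP MOVSB for benchmarking.
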
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
Thread overview: 9+ messages
2024-02-08 13:08 [PATCH v3 0/3] x86: Improve ERMS usage on Zen3+ Adhemerval Zanella
2024-02-08 13:08 ` [PATCH v3 1/3] x86: Fix Zen3/Zen4 ERMS selection (BZ 30994) Adhemerval Zanella
2024-02-12 15:56 ` H.J. Lu [this message]
2024-02-08 13:08 ` [PATCH v3 2/3] x86: Do not prefer ERMS for memset on Zen3+ Adhemerval Zanella
2024-02-12 15:56 ` H.J. Lu
2024-02-08 13:08 ` [PATCH v3 3/3] x86: Expand the comment on when REP STOSB is used on memset Adhemerval Zanella
2024-02-12 15:56 ` H.J. Lu
2024-03-25 15:15 ` [PATCH v3 0/3] x86: Improve ERMS usage on Zen3+ Florian Weimer
2024-03-25 15:19 ` H.J. Lu