public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] x86: Fix for cache computation on AMD legacy cpus.
@ 2023-06-02 13:19 sajan.karumanchi
  2023-06-02 16:51 ` Florian Weimer
                   ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: sajan.karumanchi @ 2023-06-02 13:19 UTC (permalink / raw)
  To: libc-alpha, carlos, fweimer; +Cc: Sajan Karumanchi, premachandra.mallappa

From: Sajan Karumanchi <sajan.karumanchi@amd.com>

Some legacy AMD CPUs and hypervisors have the _cpuid_ '0x8000_001D'
set to Zero, thus resulting in zeroed-out computed cache values.
This patch reintroduces the old way of cache computation as a
failsafe option to handle these exceptions.

Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>
---
 sysdeps/x86/dl-cacheinfo.h | 218 +++++++++++++++++++++++++++++++++----
 1 file changed, 194 insertions(+), 24 deletions(-)

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 877e73d700..1d15b6bcd8 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -315,40 +315,203 @@ handle_amd (int name)
 {
   unsigned int eax;
   unsigned int ebx;
-  unsigned int ecx;
+  unsigned int ecx = 0;
   unsigned int edx;
-  unsigned int count = 0x1;
+  unsigned int max_cpuid = 0;
+  unsigned int fn = 0;
 
   /* No level 4 cache (yet).  */
   if (name > _SC_LEVEL3_CACHE_LINESIZE)
     return 0;
 
-  if (name >= _SC_LEVEL3_CACHE_SIZE)
-    count = 0x3;
-  else if (name >= _SC_LEVEL2_CACHE_SIZE)
-    count = 0x2;
-  else if (name >= _SC_LEVEL1_DCACHE_SIZE)
-    count = 0x0;
+  __cpuid(0x80000000, max_cpuid, ebx, ecx, edx);
+
+  if (max_cpuid >= 0x8000001D)
+    /* Use __cpuid__ '0x8000_001D' to compute cache details. */
+    {
+      unsigned int count = 0x1;
+
+      if (name >= _SC_LEVEL3_CACHE_SIZE)
+        count = 0x3;
+      else if (name >= _SC_LEVEL2_CACHE_SIZE)
+        count = 0x2;
+      else if (name >= _SC_LEVEL1_DCACHE_SIZE)
+        count = 0x0;
+
+      __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
+
+      if (ecx != 0)
+        {
+          switch (name)
+            {
+            case _SC_LEVEL1_ICACHE_ASSOC:
+            case _SC_LEVEL1_DCACHE_ASSOC:
+            case _SC_LEVEL2_CACHE_ASSOC:
+            case _SC_LEVEL3_CACHE_ASSOC:
+              return ((ebx >> 22) & 0x3ff) + 1;
+            case _SC_LEVEL1_ICACHE_LINESIZE:
+            case _SC_LEVEL1_DCACHE_LINESIZE:
+            case _SC_LEVEL2_CACHE_LINESIZE:
+            case _SC_LEVEL3_CACHE_LINESIZE:
+              return (ebx & 0xfff) + 1;
+            case _SC_LEVEL1_ICACHE_SIZE:
+            case _SC_LEVEL1_DCACHE_SIZE:
+            case _SC_LEVEL2_CACHE_SIZE:
+            case _SC_LEVEL3_CACHE_SIZE:
+              return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1);
+            default:
+              __builtin_unreachable ();
+            }
+          return -1;
+        }
+    }
+
+  /* Legacy cache computation when __cpuid__ 0x8000_001D is Zero */
+  fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
+
+  if (max_cpuid < fn)
+    return 0;
+
+  __cpuid (fn, eax, ebx, ecx, edx);
 
-  __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
+  if (name < _SC_LEVEL1_DCACHE_SIZE)
+    {
+      name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
+      ecx = edx;
+    }
 
   switch (name)
     {
-    case _SC_LEVEL1_ICACHE_ASSOC:
-    case _SC_LEVEL1_DCACHE_ASSOC:
-    case _SC_LEVEL2_CACHE_ASSOC:
+      case _SC_LEVEL1_DCACHE_SIZE:
+        return (ecx >> 14) & 0x3fc00;
+
+      case _SC_LEVEL1_DCACHE_ASSOC:
+        ecx >>= 16;
+        if ((ecx & 0xff) == 0xff)
+        {
+          /* Fully associative.  */
+          return (ecx << 2) & 0x3fc00;
+        }
+        return ecx & 0xff;
+
+      case _SC_LEVEL1_DCACHE_LINESIZE:
+        return ecx & 0xff;
+
+      case _SC_LEVEL2_CACHE_SIZE:
+        return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;
+
+      case _SC_LEVEL2_CACHE_ASSOC:
+        switch ((ecx >> 12) & 0xf)
+          {
+            case 0:
+            case 1:
+            case 2:
+            case 4:
+              return (ecx >> 12) & 0xf;
+            case 6:
+              return 8;
+            case 8:
+              return 16;
+            case 10:
+              return 32;
+            case 11:
+              return 48;
+            case 12:
+              return 64;
+            case 13:
+              return 96;
+            case 14:
+              return 128;
+            case 15:
+              return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
+            default:
+              return 0;
+          }
+
+      case _SC_LEVEL2_CACHE_LINESIZE:
+        return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;
+
+      case _SC_LEVEL3_CACHE_SIZE:
+        {
+        long int total_l3_cache = 0, l3_cache_per_thread = 0;
+        unsigned int threads = 0;
+        const struct cpu_features *cpu_features;
+
+        if ((edx & 0xf000) == 0)
+          return 0;
+
+        total_l3_cache = (edx & 0x3ffc0000) << 1;
+        cpu_features = __get_cpu_features ();
+
+        /* Figure out the number of logical threads that share L3.  */
+        if (max_cpuid >= 0x80000008)
+          {
+            /* Get width of APIC ID.  */
+            __cpuid (0x80000008, eax, ebx, ecx, edx);
+            threads = 1 << ((ecx >> 12) & 0x0f);
+          }
+
+        if (threads == 0 || cpu_features->basic.family >= 0x17)
+          {
+            /* If APIC ID width is not available, use logical
+            processor count.  */
+            __cpuid (0x00000001, eax, ebx, ecx, edx);
+            if ((edx & (1 << 28)) != 0)
+              threads = (ebx >> 16) & 0xff;
+          }
+
+        /* Cap usage of highest cache level to the number of
+           supported threads.  */
+        if (threads > 0)
+          l3_cache_per_thread = total_l3_cache/threads;
+
+        /* Get shared cache per ccx for Zen architectures.  */
+        if (cpu_features->basic.family >= 0x17)
+          {
+            long int l3_cache_per_ccx = 0;
+            /* Get number of threads share the L3 cache in CCX.  */
+            __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);
+            unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
+            l3_cache_per_ccx = l3_cache_per_thread * threads_per_ccx;
+            return l3_cache_per_ccx;
+          }
+        else
+          {
+            return l3_cache_per_thread;
+          }
+      }
+
     case _SC_LEVEL3_CACHE_ASSOC:
-      return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0;
-    case _SC_LEVEL1_ICACHE_LINESIZE:
-    case _SC_LEVEL1_DCACHE_LINESIZE:
-    case _SC_LEVEL2_CACHE_LINESIZE:
+      switch ((edx >> 12) & 0xf)
+      {
+        case 0:
+        case 1:
+        case 2:
+        case 4:
+          return (edx >> 12) & 0xf;
+        case 6:
+          return 8;
+        case 8:
+          return 16;
+        case 10:
+          return 32;
+        case 11:
+          return 48;
+        case 12:
+          return 64;
+        case 13:
+          return 96;
+        case 14:
+          return 128;
+        case 15:
+          return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
+        default:
+          return 0;
+      }
+
     case _SC_LEVEL3_CACHE_LINESIZE:
-      return ecx ? (ebx & 0xfff) + 1 : 0;
-    case _SC_LEVEL1_ICACHE_SIZE:
-    case _SC_LEVEL1_DCACHE_SIZE:
-    case _SC_LEVEL2_CACHE_SIZE:
-    case _SC_LEVEL3_CACHE_SIZE:
-      return ecx ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1): 0;
+      return (edx & 0xf000) == 0 ? 0 : edx & 0xff;
+
     default:
       __builtin_unreachable ();
     }
@@ -713,8 +876,15 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);
 
       if (shared <= 0)
-        /* No shared L3 cache.  All we have is the L2 cache.  */
-	shared = core;
+        {
+           /* No shared L3 cache.  All we have is the L2 cache.  */
+           shared = core;
+        }
+      else if (cpu_features->basic.family < 0x17)
+        {
+           /* Account for exclusive L2 and L3 caches.  */
+           shared += core;
+        }
     }
 
   cpu_features->level1_icache_size = level1_icache_size;
-- 
2.34.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] x86: Fix for cache computation on AMD legacy cpus.
  2023-06-02 13:19 [PATCH] x86: Fix for cache computation on AMD legacy cpus sajan.karumanchi
@ 2023-06-02 16:51 ` Florian Weimer
  2023-06-05 18:59 ` Florian Weimer
  2023-08-01 15:20 ` PATCH v2] " sajan.karumanchi
  2 siblings, 0 replies; 8+ messages in thread
From: Florian Weimer @ 2023-06-02 16:51 UTC (permalink / raw)
  To: sajan.karumanchi--- via Libc-alpha
  Cc: carlos, fweimer, sajan.karumanchi, Sajan Karumanchi,
	premachandra.mallappa

* sajan karumanchi:

> From: Sajan Karumanchi <sajan.karumanchi@amd.com>
>
> Some legacy AMD CPUs and hypervisors have the _cpuid_ '0x8000_001D'
> set to Zero, thus resulting in zeroed-out computed cache values.
> This patch reintroduces the old way of cache computation as a
> failsafe option to handle these exceptions.
>
> Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>

Thanks.  On “AMD Turion(tm) II Neo N40L Dual-Core Processor”, this
fixes the regression.  The reported cache sizes are identical.

There's still one remaining issue (not a regression):

x86.cpu_features.level4_cache_size=0xffffffffffffffff

I think this should be zero if there is no L4 cache.

I'm going to test more machines early next week.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] x86: Fix for cache computation on AMD legacy cpus.
  2023-06-02 13:19 [PATCH] x86: Fix for cache computation on AMD legacy cpus sajan.karumanchi
  2023-06-02 16:51 ` Florian Weimer
@ 2023-06-05 18:59 ` Florian Weimer
  2023-06-06 13:42   ` Karumanchi, Sajan
  2023-08-01 15:20 ` PATCH v2] " sajan.karumanchi
  2 siblings, 1 reply; 8+ messages in thread
From: Florian Weimer @ 2023-06-05 18:59 UTC (permalink / raw)
  To: sajan.karumanchi
  Cc: libc-alpha, carlos, Sajan Karumanchi, premachandra.mallappa

* sajan karumanchi:

> From: Sajan Karumanchi <sajan.karumanchi@amd.com>
>
> Some legacy AMD CPUs and hypervisors have the _cpuid_ '0x8000_001D'
> set to Zero, thus resulting in zeroed-out computed cache values.
> This patch reintroduces the old way of cache computation as a
> failsafe option to handle these exceptions.
>
> Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>
> ---
>  sysdeps/x86/dl-cacheinfo.h | 218 +++++++++++++++++++++++++++++++++----
>  1 file changed, 194 insertions(+), 24 deletions(-)

On a Phenom II X6 1055T CPU, I see this difference compared to what we
had before:

@@ -212,7 +211,7 @@
 x86.cpu_features.level2_cache_size=0x80000
 x86.cpu_features.level2_cache_assoc=0x10
 x86.cpu_features.level2_cache_linesize=0x40
-x86.cpu_features.level3_cache_size=0x600000
+x86.cpu_features.level3_cache_size=0xc0000
 x86.cpu_features.level3_cache_assoc=0x30
 x86.cpu_features.level3_cache_linesize=0x40
 x86.cpu_features.level4_cache_size=0xffffffffffffffff

According to Wikipedia, L3 is shared, so I would have expected 0x100000
here with the correction factor.  The 0xc0000 number we had before seems
wrong, too.

On what appears to be a two-socket 12-core Magny-Cours prototype, I get
this:

@@ -212,7 +211,7 @@
 x86.cpu_features.level2_cache_size=0x80000
 x86.cpu_features.level2_cache_assoc=0x10
 x86.cpu_features.level2_cache_linesize=0x40
-x86.cpu_features.level3_cache_size=0xa00000
+x86.cpu_features.level3_cache_size=0xa0000
 x86.cpu_features.level3_cache_assoc=0x60
 x86.cpu_features.level3_cache_linesize=0x40
 x86.cpu_features.level4_cache_size=0xffffffffffffffff

According to Wikipedia, each socket has two chips with 6 MiB shared L3
cache each, which means that the system has a total of 24 MiB shared L3
cache.  With the per-thread correction factor, should we get 1 MiB L3
cache?

I don't know how far away either CPU is from the production silicon.
The Phenom CPU at least has the model name field filled in properly.

I'm not sure how to proceed here.  Should I try to get more complete
CPUID dumps?  The data that is used for cache computation mostly doesn't
make it into the --list-diagnostics output.

Thanks,
Florian


^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH] x86: Fix for cache computation on AMD legacy cpus.
  2023-06-05 18:59 ` Florian Weimer
@ 2023-06-06 13:42   ` Karumanchi, Sajan
  0 siblings, 0 replies; 8+ messages in thread
From: Karumanchi, Sajan @ 2023-06-06 13:42 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha, carlos, Mallappa, Premachandra

[AMD Official Use Only - General]

Hi Florian,

> -----Original Message-----
> From: Florian Weimer <fweimer@redhat.com>
> Sent: Tuesday, June 6, 2023 12:29 AM
> To: sajan.karumanchi@gmail.com
> Cc: libc-alpha@sourceware.org; carlos@redhat.com; Karumanchi, Sajan
> <Sajan.Karumanchi@amd.com>; Mallappa, Premachandra
> <Premachandra.Mallappa@amd.com>
> Subject: Re: [PATCH] x86: Fix for cache computation on AMD legacy cpus.
>
> Caution: This message originated from an External Source. Use proper caution
> when opening attachments, clicking links, or responding.
>
>
> * sajan karumanchi:
>
> > From: Sajan Karumanchi <sajan.karumanchi@amd.com>
> >
> > Some legacy AMD CPUs and hypervisors have the _cpuid_ '0x8000_001D'
> > set to Zero, thus resulting in zeroed-out computed cache values.
> > This patch reintroduces the old way of cache computation as a failsafe
> > option to handle these exceptions.
> >
> > Reviewed-by: Premachandra Mallappa
> <premachandra.mallappa@amd.com>
> > ---
> >  sysdeps/x86/dl-cacheinfo.h | 218
> > +++++++++++++++++++++++++++++++++----
> >  1 file changed, 194 insertions(+), 24 deletions(-)
>
> On a Phenom II X6 1055T CPU, I see this difference compared to what we had
> before:
>
> @@ -212,7 +211,7 @@
>  x86.cpu_features.level2_cache_size=0x80000
>  x86.cpu_features.level2_cache_assoc=0x10
>  x86.cpu_features.level2_cache_linesize=0x40
> -x86.cpu_features.level3_cache_size=0x600000
> +x86.cpu_features.level3_cache_size=0xc0000
>  x86.cpu_features.level3_cache_assoc=0x30
>  x86.cpu_features.level3_cache_linesize=0x40
>  x86.cpu_features.level4_cache_size=0xffffffffffffffff
>
> According to Wikipedia, L3 is shared, so I would have expected 0x100000 here
> with the correction factor.  The 0xc0000 number we had before seems wrong,
> too.
>
> On what appears to be a two-socket 12-core Magny-Cours prototype, I get
> this:
>
> @@ -212,7 +211,7 @@
>  x86.cpu_features.level2_cache_size=0x80000
>  x86.cpu_features.level2_cache_assoc=0x10
>  x86.cpu_features.level2_cache_linesize=0x40
> -x86.cpu_features.level3_cache_size=0xa00000
> +x86.cpu_features.level3_cache_size=0xa0000
>  x86.cpu_features.level3_cache_assoc=0x60
>  x86.cpu_features.level3_cache_linesize=0x40
>  x86.cpu_features.level4_cache_size=0xffffffffffffffff
>
> According to Wikipedia, each socket has two chips with 6 MiB shared L3 cache
> each, which means that the system has a total of 24 MiB shared L3 cache.
> With the per-thread correction factor, should we get 1 MiB L3 cache?
>
> I don't know how far away either CPU is from the production silicon.
> The Phenom CPU at least has the model name field filled in properly.
>
> I'm not sure how to proceed here.  Should I try to get more complete CPUID
> dumps?  The data that is used for cache computation mostly doesn't make it
> into the --list-diagnostics output.
>
> Thanks,
> Florian

Thank you for testing on the legacy machines.
It would be helpful if you could share the complete cpuid dumps, as we could not find these machines in our lab.

Thanks,
Sajan K.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* PATCH v2] x86: Fix for cache computation on AMD legacy cpus.
  2023-06-02 13:19 [PATCH] x86: Fix for cache computation on AMD legacy cpus sajan.karumanchi
  2023-06-02 16:51 ` Florian Weimer
  2023-06-05 18:59 ` Florian Weimer
@ 2023-08-01 15:20 ` sajan.karumanchi
  2023-08-01 15:20   ` [PATCH " sajan.karumanchi
  2 siblings, 1 reply; 8+ messages in thread
From: sajan.karumanchi @ 2023-08-01 15:20 UTC (permalink / raw)
  To: fweimer; +Cc: libc-alpha, premachandra.mallappa, carlos

* Florian,
> >> According to Wikipedia, each socket has two chips with 6 MiB shared
> >> L3 cache each, which means that the system has a total of 24 MiB
> >> shared L3
> cache.
> >> With the per-thread correction factor, should we get 1 MiB L3 cache?
From the raw CPUID dumps I see that the identified (cpuid tool) and
computed (glibc) L3 cache is 10MB.
In a few Opteron models, only 5MB of the chip's L3 is visible with the
HT-Assist feature enabled:
 "Magny-Cours models have 12 MB of L3 cache (2 × 6 MB) but only 10 MB
 is visible with the HT Assist feature activated using 2 MB as a directory cache."
> >>
> >> I don't know how far away either CPU is from the production silicon.
> >> The Phenom CPU at least has the model name field filled in properly.
> >>
> >> I'm not sure how to proceed here.  Should I try to get more
> >> complete CPUID dumps?  The data that is used for cache computation
> >> mostly doesn't make it into the --list-diagnostics output.
> >>
> >> Thanks,
> >> Florian
> >
> > Thank you for testing on the legacy machines.
> > It would be helpful if you could share the complete cupid dumps as
> > we could
> not find these machines in our lab.
>
> As discussed yesterday, these are the dumps I could produce.  Please
> let me know if those are what you were looking for.
>
> Thanks,
> Florian

The attached patch should now compute the cache details as expected for
legacy machines and also configure the L4 cache to Zero for all AMD CPUs.

Thanks,
Sajan


^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v2] x86: Fix for cache computation on AMD legacy cpus.
  2023-08-01 15:20 ` PATCH v2] " sajan.karumanchi
@ 2023-08-01 15:20   ` sajan.karumanchi
  2023-08-02  1:36     ` Sergio Durigan Junior
  2023-08-02  6:43     ` Florian Weimer
  0 siblings, 2 replies; 8+ messages in thread
From: sajan.karumanchi @ 2023-08-01 15:20 UTC (permalink / raw)
  To: fweimer; +Cc: libc-alpha, premachandra.mallappa, carlos, Sajan Karumanchi

From: Sajan Karumanchi <sajan.karumanchi@amd.com>

Some legacy AMD CPUs and hypervisors have the _cpuid_ '0x8000_001D'
set to Zero, thus resulting in zeroed-out computed cache values.
This patch reintroduces the old way of cache computation as a
fail-safe option to handle these exceptions.
Fixed 'level4_cache_size' value through handle_amd().

Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index cd4d0351ae..285773039f 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -315,40 +315,206 @@ handle_amd (int name)
 {
   unsigned int eax;
   unsigned int ebx;
-  unsigned int ecx;
+  unsigned int ecx = 0;
   unsigned int edx;
-  unsigned int count = 0x1;
+  unsigned int max_cpuid = 0;
+  unsigned int fn = 0;
 
   /* No level 4 cache (yet).  */
   if (name > _SC_LEVEL3_CACHE_LINESIZE)
     return 0;
 
-  if (name >= _SC_LEVEL3_CACHE_SIZE)
-    count = 0x3;
-  else if (name >= _SC_LEVEL2_CACHE_SIZE)
-    count = 0x2;
-  else if (name >= _SC_LEVEL1_DCACHE_SIZE)
-    count = 0x0;
+  __cpuid (0x80000000, max_cpuid, ebx, ecx, edx);
+
+  if (max_cpuid >= 0x8000001D)
+    /* Use __cpuid__ '0x8000_001D' to compute cache details.  */
+    {
+      unsigned int count = 0x1;
+
+      if (name >= _SC_LEVEL3_CACHE_SIZE)
+        count = 0x3;
+      else if (name >= _SC_LEVEL2_CACHE_SIZE)
+        count = 0x2;
+      else if (name >= _SC_LEVEL1_DCACHE_SIZE)
+        count = 0x0;
+
+      __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
+
+      if (ecx != 0)
+        {
+          switch (name)
+            {
+            case _SC_LEVEL1_ICACHE_ASSOC:
+            case _SC_LEVEL1_DCACHE_ASSOC:
+            case _SC_LEVEL2_CACHE_ASSOC:
+            case _SC_LEVEL3_CACHE_ASSOC:
+              return ((ebx >> 22) & 0x3ff) + 1;
+            case _SC_LEVEL1_ICACHE_LINESIZE:
+            case _SC_LEVEL1_DCACHE_LINESIZE:
+            case _SC_LEVEL2_CACHE_LINESIZE:
+            case _SC_LEVEL3_CACHE_LINESIZE:
+              return (ebx & 0xfff) + 1;
+            case _SC_LEVEL1_ICACHE_SIZE:
+            case _SC_LEVEL1_DCACHE_SIZE:
+            case _SC_LEVEL2_CACHE_SIZE:
+            case _SC_LEVEL3_CACHE_SIZE:
+              return (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1);
+            default:
+              __builtin_unreachable ();
+            }
+          return -1;
+        }
+    }
+
+  /* Legacy cache computation for CPUs prior to Bulldozer family.
+     This is also a fail-safe mechanism for some hypervisors that
+     accidentally configure __cpuid__ '0x8000_001D' to Zero.  */
 
-  __cpuid_count (0x8000001D, count, eax, ebx, ecx, edx);
+  fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
+
+  if (max_cpuid < fn)
+    return 0;
+
+  __cpuid (fn, eax, ebx, ecx, edx);
+
+  if (name < _SC_LEVEL1_DCACHE_SIZE)
+    {
+      name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
+      ecx = edx;
+    }
 
   switch (name)
     {
-    case _SC_LEVEL1_ICACHE_ASSOC:
-    case _SC_LEVEL1_DCACHE_ASSOC:
-    case _SC_LEVEL2_CACHE_ASSOC:
+      case _SC_LEVEL1_DCACHE_SIZE:
+        return (ecx >> 14) & 0x3fc00;
+
+      case _SC_LEVEL1_DCACHE_ASSOC:
+        ecx >>= 16;
+        if ((ecx & 0xff) == 0xff)
+        {
+          /* Fully associative.  */
+          return (ecx << 2) & 0x3fc00;
+        }
+        return ecx & 0xff;
+
+      case _SC_LEVEL1_DCACHE_LINESIZE:
+        return ecx & 0xff;
+
+      case _SC_LEVEL2_CACHE_SIZE:
+        return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;
+
+      case _SC_LEVEL2_CACHE_ASSOC:
+        switch ((ecx >> 12) & 0xf)
+          {
+            case 0:
+            case 1:
+            case 2:
+            case 4:
+              return (ecx >> 12) & 0xf;
+            case 6:
+              return 8;
+            case 8:
+              return 16;
+            case 10:
+              return 32;
+            case 11:
+              return 48;
+            case 12:
+              return 64;
+            case 13:
+              return 96;
+            case 14:
+              return 128;
+            case 15:
+              return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
+            default:
+              return 0;
+          }
+
+      case _SC_LEVEL2_CACHE_LINESIZE:
+        return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;
+
+      case _SC_LEVEL3_CACHE_SIZE:
+        {
+        long int total_l3_cache = 0, l3_cache_per_thread = 0;
+        unsigned int threads = 0;
+        const struct cpu_features *cpu_features;
+
+        if ((edx & 0xf000) == 0)
+          return 0;
+
+        total_l3_cache = (edx & 0x3ffc0000) << 1;
+        cpu_features = __get_cpu_features ();
+
+        /* Figure out the number of logical threads that share L3.  */
+        if (max_cpuid >= 0x80000008)
+          {
+            /* Get width of APIC ID.  */
+            __cpuid (0x80000008, eax, ebx, ecx, edx);
+            threads = (ecx & 0xff) + 1;
+          }
+
+        if (threads == 0)
+          {
+            /* If APIC ID width is not available, use logical
+            processor count.  */
+            __cpuid (0x00000001, eax, ebx, ecx, edx);
+            if ((edx & (1 << 28)) != 0)
+              threads = (ebx >> 16) & 0xff;
+          }
+
+        /* Cap usage of highest cache level to the number of
+           supported threads.  */
+        if (threads > 0)
+          l3_cache_per_thread = total_l3_cache/threads;
+
+        /* Get shared cache per ccx for Zen architectures.  */
+        if (cpu_features->basic.family >= 0x17)
+          {
+            long int l3_cache_per_ccx = 0;
+            /* Get number of threads share the L3 cache in CCX.  */
+            __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);
+            unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
+            l3_cache_per_ccx = l3_cache_per_thread * threads_per_ccx;
+            return l3_cache_per_ccx;
+          }
+        else
+          {
+            return l3_cache_per_thread;
+          }
+      }
+
     case _SC_LEVEL3_CACHE_ASSOC:
-      return ecx ? ((ebx >> 22) & 0x3ff) + 1 : 0;
-    case _SC_LEVEL1_ICACHE_LINESIZE:
-    case _SC_LEVEL1_DCACHE_LINESIZE:
-    case _SC_LEVEL2_CACHE_LINESIZE:
+      switch ((edx >> 12) & 0xf)
+      {
+        case 0:
+        case 1:
+        case 2:
+        case 4:
+          return (edx >> 12) & 0xf;
+        case 6:
+          return 8;
+        case 8:
+          return 16;
+        case 10:
+          return 32;
+        case 11:
+          return 48;
+        case 12:
+          return 64;
+        case 13:
+          return 96;
+        case 14:
+          return 128;
+        case 15:
+          return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
+        default:
+          return 0;
+      }
+
     case _SC_LEVEL3_CACHE_LINESIZE:
-      return ecx ? (ebx & 0xfff) + 1 : 0;
-    case _SC_LEVEL1_ICACHE_SIZE:
-    case _SC_LEVEL1_DCACHE_SIZE:
-    case _SC_LEVEL2_CACHE_SIZE:
-    case _SC_LEVEL3_CACHE_SIZE:
-      return ecx ? (((ebx >> 22) & 0x3ff) + 1) * ((ebx & 0xfff) + 1) * (ecx + 1): 0;
+      return (edx & 0xf000) == 0 ? 0 : edx & 0xff;
+
     default:
       __builtin_unreachable ();
     }
@@ -703,7 +869,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
       core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
       shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
-      shared_per_thread = shared;
 
       level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
       level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
@@ -716,13 +881,20 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
       level3_cache_size = shared;
       level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC);
       level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);
+      level4_cache_size = handle_amd (_SC_LEVEL4_CACHE_SIZE);
 
       if (shared <= 0)
-        /* No shared L3 cache.  All we have is the L2 cache.  */
-	shared = core;
+        {
+           /* No shared L3 cache.  All we have is the L2 cache.  */
+           shared = core;
+        }
+      else if (cpu_features->basic.family < 0x17)
+        {
+           /* Account for exclusive L2 and L3 caches.  */
+           shared += core;
+        }
 
-      if (shared_per_thread <= 0)
-	shared_per_thread = shared;
+      shared_per_thread = shared;
     }
 
   cpu_features->level1_icache_size = level1_icache_size;
-- 
2.34.1


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] x86: Fix for cache computation on AMD legacy cpus.
  2023-08-01 15:20   ` [PATCH " sajan.karumanchi
@ 2023-08-02  1:36     ` Sergio Durigan Junior
  2023-08-02  6:43     ` Florian Weimer
  1 sibling, 0 replies; 8+ messages in thread
From: Sergio Durigan Junior @ 2023-08-02  1:36 UTC (permalink / raw)
  To: sajan.karumanchi
  Cc: fweimer, libc-alpha, premachandra.mallappa, carlos, Sajan Karumanchi

On Tuesday, August 01 2023, sajan karumanchi wrote:

> Some legacy AMD CPUs and hypervisors have the _cpuid_ '0x8000_001D'
> set to Zero, thus resulting in zeroed-out computed cache values.
> This patch reintroduces the old way of cache computation as a
> fail-safe option to handle these exceptions.
> Fixed 'level4_cache_size' value through handle_amd().
>
> Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>

Hi,

As previously discussed at
https://www.mail-archive.com/qemu-devel@nongnu.org/msg973967.html, this
is affecting QEMU.  More specifically, we're seeing QEMU segfault in
Ubuntu's autopkgtest infrastructure whenever it's used in a nested VM
environment, which is causing some headaches.

I backported the proposed patch to glibc 2.37 (the version currently
shipped in Ubuntu's development version) and verified that it seems to
fix the problem, so I thought it'd be worth mentioning it here.

Cheers,

-- 
Sergio
GPG key ID: 237A 54B1 0287 28BF 00EF  31F4 D0EB 7628 65FC 5E36
Please send encrypted e-mail if possible
https://sergiodj.net/

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] x86: Fix for cache computation on AMD legacy cpus.
  2023-08-01 15:20   ` [PATCH " sajan.karumanchi
  2023-08-02  1:36     ` Sergio Durigan Junior
@ 2023-08-02  6:43     ` Florian Weimer
  1 sibling, 0 replies; 8+ messages in thread
From: Florian Weimer @ 2023-08-02  6:43 UTC (permalink / raw)
  To: sajan.karumanchi
  Cc: libc-alpha, premachandra.mallappa, carlos, Sajan Karumanchi

* sajan karumanchi:

> From: Sajan Karumanchi <sajan.karumanchi@amd.com>
>
> Some legacy AMD CPUs and hypervisors have the _cpuid_ '0x8000_001D'
> set to Zero, thus resulting in zeroed-out computed cache values.
> This patch reintroduces the old way of cache computation as a
> fail-safe option to handle these exceptions.
> Fixed 'level4_cache_size' value through handle_amd().
>
> Reviewed-by: Premachandra Mallappa <premachandra.mallappa@amd.com>

I checked that the reported cache sizes are now back to what they were
before on a few older systems.  I think this should go in.

Tested-by: Florian Weimer <fweimer@redhat.com>

Sajan, do you want to backport this immediately to 2.38, or wait a bit?

Thanks,
Florian


^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2023-08-02  6:43 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-02 13:19 [PATCH] x86: Fix for cache computation on AMD legacy cpus sajan.karumanchi
2023-06-02 16:51 ` Florian Weimer
2023-06-05 18:59 ` Florian Weimer
2023-06-06 13:42   ` Karumanchi, Sajan
2023-08-01 15:20 ` PATCH v2] " sajan.karumanchi
2023-08-01 15:20   ` [PATCH " sajan.karumanchi
2023-08-02  1:36     ` Sergio Durigan Junior
2023-08-02  6:43     ` Florian Weimer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).