Adhemerval Zanella via Libc-alpha writes: > A new tunable, 'glibc.malloc.mmap_hugetlb', adds support to use Huge Pages > directly with mmap() calls. The required supported sizes and > flags for mmap() are provided by an arch-specific internal hook > malloc_hp_config(). > > Currently it first tries mmap() using the huge page size and falls back to > the default page size and sbrk() call if the kernel returns MAP_FAILED. > > The default malloc_hp_config() implementation does not enable it even > if the tunable is set. > > Checked on x86_64-linux-gnu. > --- > NEWS | 4 + > elf/dl-tunables.list | 4 + > elf/tst-rtld-list-tunables.exp | 1 + > malloc/arena.c | 2 + > malloc/malloc.c | 35 +++++- > manual/tunables.texi | 14 +++ > sysdeps/generic/malloc-hugepages.c | 6 + > sysdeps/generic/malloc-hugepages.h | 12 ++ > sysdeps/unix/sysv/linux/malloc-hugepages.c | 125 +++++++++++++++++++++ > 9 files changed, 200 insertions(+), 3 deletions(-) > > diff --git a/NEWS b/NEWS > index 9b2345d08c..412bf3e6f8 100644 > --- a/NEWS > +++ b/NEWS > @@ -14,6 +14,10 @@ Major new features: > It might improve performance with Transparent Huge Pages madvise mode > depending of the workload. > > +* On Linux, a new tunable, glibc.malloc.mmap_hugetlb, can be used to > + instruct malloc to try to use Huge Pages when allocating memory with mmap() > + calls (through the use of MAP_HUGETLB). 
> + > Deprecated and removed features, and other changes affecting compatibility: > > [Add deprecations, removals and changes affecting compatibility here] > diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list > index 67df6dbc2c..209c2d8592 100644 > --- a/elf/dl-tunables.list > +++ b/elf/dl-tunables.list > @@ -97,6 +97,10 @@ glibc { > minval: 0 > maxval: 1 > } > + mmap_hugetlb { > + type: SIZE_T > + minval: 0 > + } > } > cpu { > hwcap_mask { > diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp > index d8109fa31c..49f033ce91 100644 > --- a/elf/tst-rtld-list-tunables.exp > +++ b/elf/tst-rtld-list-tunables.exp > @@ -1,6 +1,7 @@ > glibc.malloc.arena_max: 0x0 (min: 0x1, max: 0x[f]+) > glibc.malloc.arena_test: 0x0 (min: 0x1, max: 0x[f]+) > glibc.malloc.check: 0 (min: 0, max: 3) > +glibc.malloc.mmap_hugetlb: 0x0 (min: 0x0, max: 0x[f]+) > glibc.malloc.mmap_max: 0 (min: 0, max: 2147483647) > glibc.malloc.mmap_threshold: 0x0 (min: 0x0, max: 0x[f]+) > glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0x[f]+) > diff --git a/malloc/arena.c b/malloc/arena.c > index 81bff54303..4efb5581c1 100644 > --- a/malloc/arena.c > +++ b/malloc/arena.c > @@ -232,6 +232,7 @@ TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t) > #endif > TUNABLE_CALLBACK_FNDECL (set_mxfast, size_t) > TUNABLE_CALLBACK_FNDECL (set_thp_madvise, int32_t) > +TUNABLE_CALLBACK_FNDECL (set_mmap_hugetlb, size_t) > #else > /* Initialization routine. 
*/ > #include > @@ -333,6 +334,7 @@ ptmalloc_init (void) > # endif > TUNABLE_GET (mxfast, size_t, TUNABLE_CALLBACK (set_mxfast)); > TUNABLE_GET (thp_madvise, int32_t, TUNABLE_CALLBACK (set_thp_madvise)); > + TUNABLE_GET (mmap_hugetlb, size_t, TUNABLE_CALLBACK (set_mmap_hugetlb)); > #else > if (__glibc_likely (_environ != NULL)) > { > diff --git a/malloc/malloc.c b/malloc/malloc.c > index 4bfcea286f..8cf2d6855e 100644 > --- a/malloc/malloc.c > +++ b/malloc/malloc.c > @@ -1884,6 +1884,10 @@ struct malloc_par > #if HAVE_TUNABLES > /* Transparent Large Page support. */ > INTERNAL_SIZE_T thp_pagesize; > + /* A value different than 0 means to align mmap allocations to hp_pagesize > + and to add hp_flags to the mmap flags. */ > + INTERNAL_SIZE_T hp_pagesize; > + int hp_flags; > #endif > > /* Memory map support */ > @@ -2415,7 +2419,8 @@ do_check_malloc_state (mstate av) > */ > > static void * > -sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av) > +sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av, > + bool set_thp) > { > long int size; > > @@ -2442,7 +2447,8 @@ sysmalloc_mmap (INTERNAL_SIZE_T nb, size_t pagesize, int extra_flags, mstate av) > if (mm == MAP_FAILED) > return mm; > > - sysmadvise_thp (mm, size); > + if (set_thp) > + sysmadvise_thp (mm, size); > > /* > The offset to the start of the mmapped region is stored in the prev_size > @@ -2531,7 +2537,18 @@ sysmalloc (INTERNAL_SIZE_T nb, mstate av) > && (mp_.n_mmaps < mp_.n_mmaps_max))) > { > try_mmap: > - char *mm = sysmalloc_mmap (nb, pagesize, 0, av); > + char *mm; > +#if HAVE_TUNABLES > + if (mp_.hp_pagesize > 0) > + { > + /* There is no need to issue the THP madvise call if Huge Pages are > + used directly. 
*/ > + mm = sysmalloc_mmap (nb, mp_.hp_pagesize, mp_.hp_flags, av, false); > + if (mm != MAP_FAILED) > + return mm; > + } > +#endif > + mm = sysmalloc_mmap (nb, pagesize, 0, av, true); > if (mm != MAP_FAILED) > return mm; > tried_mmap = true; > @@ -5405,6 +5422,18 @@ do_set_thp_madvise (int32_t value) > } > return 0; > } > + > +static __always_inline int > +do_set_mmap_hugetlb (size_t value) > +{ > + if (value > 0) > + { > + struct malloc_hugepage_config_t cfg = __malloc_hugepage_config (value); > + mp_.hp_pagesize = cfg.pagesize; > + mp_.hp_flags = cfg.flags; > + } > + return 0; > +} > #endif > > int > diff --git a/manual/tunables.texi b/manual/tunables.texi > index 93c46807f9..4da6a02778 100644 > --- a/manual/tunables.texi > +++ b/manual/tunables.texi > @@ -279,6 +279,20 @@ The default value of this tunable is @code{0}, which disable its usage. > Setting to a positive value enable the @code{madvise} call. > @end deftp > > +@deftp Tunable glibc.malloc.mmap_hugetlb > +This tunable enables the use of Huge Pages when the system supports it (currently > +only Linux). It is done by aligning the memory size and passing the required > +flags (@code{MAP_HUGETLB} on Linux) when issuing the @code{mmap} to allocate > +memory from the system. > + > +The default value of this tunable is @code{0}, which disables its usage. > +The special value @code{1} will try to gather the system default huge page size, > +while a value larger than @code{1} will try to match it with the supported system > +huge page size. If either no default huge page size could be obtained or if the > +requested size does not match the supported ones, huge page support will be > +disabled. 
> +@end deftp > + > @node Dynamic Linking Tunables > @section Dynamic Linking Tunables > @cindex dynamic linking tunables > diff --git a/sysdeps/generic/malloc-hugepages.c b/sysdeps/generic/malloc-hugepages.c > index 262bcdbeb8..e5f5c1ec98 100644 > --- a/sysdeps/generic/malloc-hugepages.c > +++ b/sysdeps/generic/malloc-hugepages.c > @@ -29,3 +29,9 @@ __malloc_thp_mode (void) > { > return malloc_thp_mode_not_supported; > } > + > +/* Generic fallback: report no huge page support (pagesize 0, flags 0). */ > +struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested) > +{ > + return (struct malloc_hugepage_config_t) { 0, 0 }; > +} > diff --git a/sysdeps/generic/malloc-hugepages.h b/sysdeps/generic/malloc-hugepages.h > index 664cda9b67..27f7adfea5 100644 > --- a/sysdeps/generic/malloc-hugepages.h > +++ b/sysdeps/generic/malloc-hugepages.h > @@ -34,4 +34,16 @@ enum malloc_thp_mode_t > > enum malloc_thp_mode_t __malloc_thp_mode (void) attribute_hidden; > > +struct malloc_hugepage_config_t > +{ > + size_t pagesize; > + int flags; > +}; > + > +/* Return the supported huge page size for the REQUESTED size along > + with the required extra mmap flags. Returning a 0 value for pagesize > + disables its usage. */ > +struct malloc_hugepage_config_t __malloc_hugepage_config (size_t requested) > + attribute_hidden; > + > #endif /* _MALLOC_HUGEPAGES_H */ > diff --git a/sysdeps/unix/sysv/linux/malloc-hugepages.c b/sysdeps/unix/sysv/linux/malloc-hugepages.c > index 66589127cd..0eb0c764ad 100644 > --- a/sysdeps/unix/sysv/linux/malloc-hugepages.c > +++ b/sysdeps/unix/sysv/linux/malloc-hugepages.c > @@ -17,8 +17,10 @@ > not, see . 
*/ > > #include > +#include > #include > #include > +#include > > size_t > __malloc_default_thp_pagesize (void) > @@ -74,3 +76,126 @@ __malloc_thp_mode (void) > } > return malloc_thp_mode_not_supported; > } > + > +static size_t > +malloc_default_hugepage_size (void) > +{ > + int fd = __open64_nocancel ("/proc/meminfo", O_RDONLY); > + if (fd == -1) > + return 0; > + > + char buf[512]; > + off64_t off = 0; > + while (1) > + { > + ssize_t r = __pread64_nocancel (fd, buf, sizeof (buf) - 1, off); > + if (r < 0) > + break; > + buf[r - 1] = '\0'; > + > + const char *s = strstr (buf, "Hugepagesize:"); > + if (s == NULL) > + { > + char *nl = strrchr (buf, '\n'); > + if (nl == NULL) > + break; > + off += (nl + 1) - buf; > + continue; > + } > + > + /* The default huge page size is in the form: > + Hugepagesize: NUMBER kB */ > + size_t hpsize = 0; > + s += sizeof ("Hugepagesize: ") - 1; > + for (int i = 0; (s[i] >= '0' && s[i] <= '9') || s[i] == ' '; i++) > + { > + if (s[i] == ' ') > + continue; > + hpsize *= 10; > + hpsize += s[i] - '0'; > + } > + return hpsize * 1024; > + } > + > + __close_nocancel (fd); > + > + return 0; > +} > + > +static inline struct malloc_hugepage_config_t > +make_malloc_hugepage_config (size_t pagesize) > +{ > + int flags = MAP_HUGETLB | (__builtin_ctzll (pagesize) << MAP_HUGE_SHIFT); > + return (struct malloc_hugepage_config_t) { pagesize, flags }; > +} > + > +struct malloc_hugepage_config_t > +__malloc_hugepage_config (size_t requested) > +{ > + if (requested == 1) > + { > + size_t pagesize = malloc_default_hugepage_size (); > + if (pagesize != 0) > + return make_malloc_hugepage_config (pagesize); > + } > + > + int dirfd = __open64_nocancel ("/sys/kernel/mm/hugepages", > + O_RDONLY | O_DIRECTORY, 0); > + if (dirfd == -1) > + return (struct malloc_hugepage_config_t) { 0, 0 }; > + > + bool found = false; > + > + char buffer[1024]; > + while (true) > + { > +#if !IS_IN(libc) > +# define __getdents64 getdents64 > +#endif > + ssize_t ret = __getdents64 
(dirfd, buffer, sizeof (buffer)); > + if (ret == -1) > + break; > + else if (ret == 0) > + break; > + > + char *begin = buffer, *end = buffer + ret; > + while (begin != end) > + { > + unsigned short int d_reclen; > + memcpy (&d_reclen, begin + offsetof (struct dirent64, d_reclen), > + sizeof (d_reclen)); > + const char *dname = begin + offsetof (struct dirent64, d_name); > + begin += d_reclen; > + > + if (dname[0] == '.' > + || strncmp (dname, "hugepages-", sizeof ("hugepages-") - 1) != 0) > + continue; > + > + /* Each entry represents a supported huge page in the form of: > + hugepages-kB. */ > + size_t hpsize = 0; > + const char *sizestr = dname + sizeof ("hugepages-") - 1; > + for (int i = 0; sizestr[i] >= '0' && sizestr[i] <= '9'; i++) > + { > + hpsize *= 10; > + hpsize += sizestr[i] - '0'; > + } > + hpsize *= 1024; > + > + if (hpsize == requested) > + { > + found = true; > + break; > + } > + } > + if (found) > + break; > + } > + > + __close_nocancel (dirfd); > + > + if (found) > + return make_malloc_hugepage_config (requested); > + > + return (struct malloc_hugepage_config_t) { 0, 0 }; > +} Hi Adhemerval, I tested this patchset on a POWER9, and I'm seeing the following test failures when running make check with glibc.malloc.mmap_hugetlb=1: malloc/tst-free-errno malloc/tst-free-errno-malloc-check malloc/tst-free-errno-mcheck posix/tst-exec posix/tst-exec-static posix/tst-spawn posix/tst-spawn-static posix/tst-spawn5 I'm attaching a summary of the contents of the .out files for each test.