On 6/25/22 21:44, Mark Wielaard wrote: > Hi Martin, Hello. > > Could you resent this patch with git send-email or rebase it against > current master. As is it doesn't apply giving errors when trying to > use it with git am. Sure, rebased and attached to this email. > > BTW. One of the reasons that I haven't reviewed things before is > because I am getting one or more testsuite FAILs with newer Fedora > versions. Which makes checking for regressions somewhat hard. > > On Wed, Jan 05, 2022 at 09:17:37AM +0100, Martin Liška wrote: >> The algorithm provides the following speed-up with -O2: >> >> - 1/5: sysdig (60M) >> dwz_O2 : 9.7 >> dwz_O2_xxhash : 8.5 (87.7%) >> - 2/5: rtags (58M) >> dwz_O2 : 17.6 >> dwz_O2_xxhash : 15.8 (89.5%) >> - 3/5: libetonyek (91M) >> dwz_O2 : 10.8 >> dwz_O2_xxhash : 9.4 (86.7%) >> - 4/5: krita (685M) >> dwz_O2 : 96.0 >> dwz_O2_xxhash : 85.6 (89.1%) >> - 5/5: gcc (1.2G) >> dwz_O2 : 58.6 >> dwz_O2_xxhash : 54.1 (92.4%) > > So a speedup of ~10%. Nice. > >> --- >> Makefile | 4 +- >> dwz.c | 103 +++++++++++++++++++++++----------------- >> hashtab.c | 139 ------------------------------------------------------ >> hashtab.h | 6 ++- >> 4 files changed, 65 insertions(+), 187 deletions(-) > > We probably also need a configure change to detect whether xxhash is > actually available. With my patch, one will see a compilation error. Can you please help me with the configure detection? > >> diff --git a/Makefile b/Makefile >> index 9394ef4..89546c2 100644 >> --- a/Makefile >> +++ b/Makefile >> @@ -8,7 +8,7 @@ CFLAGS = -O2 -g >> DWZ_VERSION := $(shell cat $(srcdir)/VERSION) >> CFLAGS_VERSION = -DDWZ_VERSION='"$(DWZ_VERSION)"' >> CFLAGS_COPYRIGHT = $(shell cat $(srcdir)/COPYRIGHT_YEARS) >> -CFLAGS_COMMON = -Wall -W -D_FILE_OFFSET_BITS=64 >> +CFLAGS_COMMON = -Wall -W -D_FILE_OFFSET_BITS=64 -DXXH_INLINE_ALL=1 >> override CFLAGS += $(CFLAGS_COMMON) $(CFLAGS_VERSION) $(CFLAGS_COPYRIGHT) >> prefix = /usr >> @@ -17,7 +17,7 @@ bindir = $(exec_prefix)/bin >> datarootdir = $(prefix)/share >> mandir = $(datarootdir)/man >> OBJECTS = args.o dwz.o hashtab.o pool.o sha1.o dwarfnames.o >> -LIBS=-lelf >> +LIBS=-lelf -lxxhash >> dwz: $(OBJECTS) >> $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) >> args.o: native.o > > So we use XXH_INLINE_ALL. Do we then still need to link with -lxxhash? Correct, it's not needed. > >> diff --git a/dwz.c b/dwz.c >> index a3b289f..cc68f4a 100644 >> --- a/dwz.c >> +++ b/dwz.c >> @@ -112,6 +112,21 @@ >> # define NT_GNU_BUILD_ID 3 >> #endif >> +/* xxHash state object. */ >> +static XXH64_state_t state; >> + >> +/* Clear xxHash state to zero. */ >> +#define hash_init_state() XXH64_reset(&state, 0) >> + >> +/* Update hash STATE with VALUE. */ >> +#define hash_update_state(value) XXH64_update(&state, &value, sizeof value) >> + >> +/* Update hash STATE with OBJECT that has a provided SIZE. */ >> +#define hash_update_state_object(object, size) XXH64_update(&state, object, size) >> + >> +/* Get digest once we are done with a state. */ >> +#define hash_digest() XXH64_digest(&state) > > Allocating and reusing the global state is convenient, but might make > parellizing dwz somewhat more complicated. Was this design used > deliberately, or can we easily switch to stack allocating the state? Yes, but the alternative approach would be making it a stack variable in each function where we use it. Note right now, processes are used for parallel execution. > > Also I find the naming slightly confusing. We used > iterative_hash_object to update a hash with an integral value, and > iterative_hash to update a hash with a size plus pointer. Now it seems > we swap the meaning. iterative_hash gets replace by > hash_update_state_object. And iterative_hash_object gets replaced by > hash_update_state. Was the meaning of "object" deliberately swapped? No, fixed that. > > Also it seems iterative_hash_object with inital zero value is replaced > by hash, which isn't defined here but in hashtab.h. Why the split in > definitions? All moved to dwz.c right now. > >> /* Print memory amount M (in kb) in both exact and human readable, like so: >> 1382508 (1.3G). */ >> static void >> @@ -1187,16 +1202,18 @@ compute_abbrev_hash (struct abbrev_tag *t) >> { >> unsigned int i; >> - t->hash = iterative_hash_object (t->tag, 0); >> - t->hash = iterative_hash_object (t->nattr, t->hash); >> - t->hash = iterative_hash_object (t->children, t->hash); >> + hash_init_state (); >> + hash_update_state (t->tag); >> + hash_update_state (t->nattr); >> + hash_update_state (t->children); >> for (i = 0; i < t->nattr; i++) >> { >> - t->hash = iterative_hash_object (t->attr[i].attr, t->hash); >> - t->hash = iterative_hash_object (t->attr[i].form, t->hash); >> + hash_update_state (t->attr[i].attr); >> + hash_update_state (t->attr[i].form); >> if (t->attr[i].form == DW_FORM_implicit_const) >> - t->hash = iterative_hash_object (t->values[i], t->hash); >> + hash_update_state (t->values[i]); >> } >> + t->hash = hash_digest (); >> } > > OK. > >> /* Maximum number of attributes in a DIE. */ >> @@ -3402,22 +3419,17 @@ checksum_die (DSO *dso, dw_cu_ref cu, dw_die_ref top_die, dw_die_ref die) >> struct dw_file *cu_file = &cu->cu_files[value - 1]; >> size_t file_len = strlen (cu_file->file); >> s = t->attr[i].attr; >> - die->u.p1.die_hash >> - = iterative_hash_object (s, die->u.p1.die_hash); >> - die->u.p1.die_hash >> - = iterative_hash_object (cu_file->time, >> - die->u.p1.die_hash); >> - die->u.p1.die_hash >> - = iterative_hash_object (cu_file->size, >> - die->u.p1.die_hash); >> - die->u.p1.die_hash >> - = iterative_hash (cu_file->file, file_len + 1, >> - die->u.p1.die_hash); >> + hash_init_state (); >> + hash_update_state (die->u.p1.die_hash); >> + hash_update_state (s); >> + hash_update_state (cu_file->time); >> + hash_update_state (cu_file->size); >> + hash_update_state_object (cu_file->file, file_len + 1); > > OK. > >> if (cu_file->dir) >> - die->u.p1.die_hash >> - = iterative_hash (cu_file->dir, >> - strlen (cu_file->dir) + 1, >> - die->u.p1.die_hash); >> + { >> + hash_update_state_object (cu_file->dir, >> + strlen (cu_file->dir) + 1); >> + } > > OK, assuming the die->u.p1.die_hash below are alwasy done/correct. > But brackets aren't really needed. > >> /* Ignore DW_AT_comp_dir for DW_AT_*_file >> etc. if immediately followed by DW_AT_*_line 0. */ >> else if (cu_file->file_angle_brackets_encapsulated_no_slash >> @@ -3427,7 +3439,13 @@ checksum_die (DSO *dso, dw_cu_ref cu, dw_die_ref top_die, dw_die_ref die) >> ? DW_AT_decl_line : DW_AT_call_line) >> && t->attr[i + 1].form == DW_FORM_data1 >> && *new_ptr == 0) >> - break; >> + { >> + die->u.p1.die_hash = hash_digest (); >> + break; >> + } >> + >> + die->u.p1.die_hash = hash_digest (); > > I find the logic hard to follow here. What makes sure the > die->u.p1.die_hash is always updated? Isn't it updated twice > (redundantly) in the above case? And needless brackets. Yeah, it's hard because the 'break;' means the next call to: die->u.p1.die_hash = hash_digest (); is not executed. And die->u.p1.die_hash is used in the current version as initial hash value (and it's modified in each iterative_hash_object call). > >> if (cu->cu_comp_dir >> && (cu_file->dir ? cu_file->dir[0] >> : cu_file->file[0]) != '/') >> @@ -5541,7 +5559,7 @@ strp_eq2 (const void *p, const void *q) >> static hashval_t >> strp_hash3 (const void *p) >> { >> - return iterative_hash (p, strlen (p), 0); >> + return hash (p, strlen (p)); >> } >> /* Corresponding equality function in strp_htab. */ > > OK. > >> @@ -5571,7 +5589,7 @@ note_strp_offset (unsigned int off) >> p = debug_sections[DEBUG_STR].data + off; >> len = strlen ((char *) p); >> - hash = iterative_hash (p, len, 0); >> + hash = hash (p, len); >> if (alt_strp_htab) >> { >> if (htab_find_with_hash (alt_strp_htab, p, hash)) > > OK. > >> @@ -5653,7 +5671,7 @@ lookup_strp_offset (unsigned int off) >> p = debug_sections[DEBUG_STR].data + off; >> len = strlen ((char *) p); >> - hash = iterative_hash (p, len, 0); >> + hash = hash (p, len); >> if (alt_strp_htab) >> { >> unsigned char *q = (unsigned char *) > > OK. > >> @@ -5694,7 +5712,7 @@ note_strp_offset2 (unsigned int off) >> { >> p = debug_sections[DEBUG_STR].data + off; >> len = strlen ((char *) p); >> - hash = iterative_hash (p, len, 0); >> + hash = hash (p, len); >> if (htab_find_with_hash (alt_strp_htab, p, hash)) >> return dwarf_5 ? DW_FORM_strp_sup : DW_FORM_GNU_strp_alt; >> } > > OK. > >> @@ -5704,7 +5722,7 @@ note_strp_offset2 (unsigned int off) >> return DW_FORM_strp; >> p = debug_sections[DEBUG_STR].data + off; >> q = (unsigned char *) strchr ((char *) p, '\0'); >> - hash = iterative_hash (p, q - p, 0); >> + hash = hash (p, q - p); >> se.off = off; >> se.new_off = hash & ~1U; >> struct strp_entry *s = (struct strp_entry *) > > OK. > >> @@ -5860,7 +5878,7 @@ finalize_strp (bool build_tail_offset_list) >> memcpy (p, debug_sections[DEBUG_STR].data + arr[i]->off, len); >> slot = htab_find_slot_with_hash (strp_htab, p, >> - iterative_hash (p, len - 1, 0), >> + hash (p, len - 1), >> INSERT); >> if (slot == NULL) >> dwz_oom (); > > OK. > >> @@ -5876,7 +5894,7 @@ finalize_strp (bool build_tail_offset_list) >> if (tail_offset_list != NULL) >> tail_offset_list[k++] = arr[j]->new_off; >> slot = htab_find_slot_with_hash (strp_htab, q, >> - iterative_hash (q, l - 1, 0), >> + hash (q, l - 1), >> INSERT); >> if (slot == NULL) >> dwz_oom (); > > OK. > >> @@ -9849,17 +9867,16 @@ line_htab_lookup (dw_cu_ref cu, unsigned int id) >> { >> void **slot; >> struct line_entry le; >> - hashval_t h; >> - >> if (id == 0) >> return 0; >> assert (id <= cu->cu_nfiles); >> le.file = &cu->cu_files[id - 1]; >> - h = iterative_hash_object (le.file->time, 0); >> - h = iterative_hash_object (le.file->size, h); >> - h = iterative_hash (le.file->file, strlen (le.file->file) + 1, h); >> + hash_init_state (); >> + hash_update_state (le.file->time); >> + hash_update_state (le.file->size); >> + hash_update_state_object (le.file->file, strlen (le.file->file) + 1); >> if (le.file->dir) >> - h = iterative_hash (le.file->dir, strlen (le.file->dir) + 1, h); >> + hash_update_state_object (le.file->dir, strlen (le.file->dir) + 1); >> if (line_htab == NULL) >> { >> line_htab = htab_try_create (50, line_hash, line_eq, NULL); >> @@ -9867,15 +9884,15 @@ line_htab_lookup (dw_cu_ref cu, unsigned int id) >> dwz_oom (); >> max_line_id = 1; >> } >> - le.hash = h; >> - slot = htab_find_slot_with_hash (line_htab, &le, h, INSERT); >> + le.hash = hash_digest ();; >> + slot = htab_find_slot_with_hash (line_htab, &le, le.hash, INSERT); > > OK, but double ;; Fixed. > >> if (slot == NULL) >> dwz_oom (); >> if (*slot == NULL) >> { >> struct line_entry *l = pool_alloc (line_entry, sizeof (*l)); >> l->file = le.file; >> - l->hash = h; >> + l->hash = le.hash; >> l->new_id = max_line_id++; >> *slot = (void *) l; >> return l->new_id; > > OK. > >> @@ -10347,7 +10364,7 @@ handle_macro (void) >> /* This should only happen if there were multiple >> same transparent units within a single object file. */ >> && htab_find_with_hash (strp_htab, p, >> - iterative_hash (p, len, 0)) == NULL) >> + hash (p, len)) == NULL) >> can_share = false; >> s = ptr; >> break; > > OK. > >> @@ -15756,7 +15773,7 @@ optimize_multifile (unsigned int *die_count) >> hashval_t hash; >> q = (unsigned char *) strchr ((char *) p, '\0'); >> - hash = iterative_hash (p, q - p, 0); >> + hash = hash (p, q - p); >> se.off = p - debug_sections[DEBUG_STR].data; >> se.new_off = hash & ~1U; >> slot = htab_find_slot_with_hash (strp_htab, &se, se.new_off, INSERT); > > OK. > >> @@ -16076,8 +16093,7 @@ read_multifile (int fd, unsigned int die_count) >> { >> q = (unsigned char *) strchr ((char *) p, '\0') + 1; >> slot = htab_find_slot_with_hash (strp_htab, p, >> - iterative_hash (p, q - p - 1, >> - 0), INSERT); >> + hash (p, q - p - 1), INSERT); >> if (slot == NULL) >> dwz_oom (); >> assert (*slot == NULL); > > OK. > >> @@ -16090,8 +16106,7 @@ read_multifile (int fd, unsigned int die_count) >> p = debug_sections[DEBUG_STR].data + *pi; >> q = (unsigned char *) strchr ((char *) p, '\0'); >> slot = htab_find_slot_with_hash (strp_htab, p, >> - iterative_hash (p, q - p, >> - 0), INSERT); >> + hash (p, q - p), INSERT); >> if (slot == NULL) >> dwz_oom (); >> assert (*slot == NULL); > > OK. > >> diff --git a/hashtab.c b/hashtab.c >> index 41eab30..a9b9d13 100644 >> --- a/hashtab.c >> +++ b/hashtab.c >> @@ -626,142 +626,3 @@ htab_restore (htab, name, restorefn) >> fclose (f); >> } >> #endif >> - >> -/* DERIVED FROM: >> --------------------------------------------------------------------- >> -lookup2.c, by Bob Jenkins, December 1996, Public Domain. >> -hash(), hash2(), hash3, and mix() are externally useful functions. >> -Routines to test the hash are included if SELF_TEST is defined. >> -You can use this free for any purpose. It has no warranty. >> --------------------------------------------------------------------- >> -*/ >> - >> -/* >> --------------------------------------------------------------------- >> -mix -- mix 3 32-bit values reversibly. >> -For every delta with one or two bit set, and the deltas of all three >> - high bits or all three low bits, whether the original value of a,b,c >> - is almost all zero or is uniformly distributed, >> -* If mix() is run forward or backward, at least 32 bits in a,b,c >> - have at least 1/4 probability of changing. >> -* If mix() is run forward, every bit of c will change between 1/3 and >> - 2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.) >> -mix() was built out of 36 single-cycle latency instructions in a >> - structure that could supported 2x parallelism, like so: >> - a -= b; >> - a -= c; x = (c>>13); >> - b -= c; a ^= x; >> - b -= a; x = (a<<8); >> - c -= a; b ^= x; >> - c -= b; x = (b>>13); >> - ... >> - Unfortunately, superscalar Pentiums and Sparcs can't take advantage >> - of that parallelism. They've also turned some of those single-cycle >> - latency instructions into multi-cycle latency instructions. Still, >> - this is the fastest good hash I could find. There were about 2^^68 >> - to choose from. I only looked at a billion or so. >> --------------------------------------------------------------------- >> -*/ >> -/* same, but slower, works on systems that might have 8 byte hashval_t's */ >> -#define mix(a,b,c) \ >> -{ \ >> - a -= b; a -= c; a ^= (c>>13); \ >> - b -= c; b -= a; b ^= (a<< 8); \ >> - c -= a; c -= b; c ^= ((b&0xffffffff)>>13); \ >> - a -= b; a -= c; a ^= ((c&0xffffffff)>>12); \ >> - b -= c; b -= a; b = (b ^ (a<<16)) & 0xffffffff; \ >> - c -= a; c -= b; c = (c ^ (b>> 5)) & 0xffffffff; \ >> - a -= b; a -= c; a = (a ^ (c>> 3)) & 0xffffffff; \ >> - b -= c; b -= a; b = (b ^ (a<<10)) & 0xffffffff; \ >> - c -= a; c -= b; c = (c ^ (b>>15)) & 0xffffffff; \ >> -} >> - >> -/* >> --------------------------------------------------------------------- >> -hash() -- hash a variable-length key into a 32-bit value >> - k : the key (the unaligned variable-length array of bytes) >> - len : the length of the key, counting by bytes >> - level : can be any 4-byte value >> -Returns a 32-bit value. Every bit of the key affects every bit of >> -the return value. Every 1-bit and 2-bit delta achieves avalanche. >> -About 36+6len instructions. >> - >> -The best hash table sizes are powers of 2. There is no need to do >> -mod a prime (mod is sooo slow!). If you need less than 32 bits, >> -use a bitmask. For example, if you need only 10 bits, do >> - h = (h & hashmask(10)); >> -In which case, the hash table should have hashsize(10) elements. >> - >> -If you are hashing n strings (ub1 **)k, do it like this: >> - for (i=0, h=0; i> - >> -By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use this >> -code any way you wish, private, educational, or commercial. It's free. >> - >> -See http://burtleburtle.net/bob/hash/evahash.html >> -Use for hash table lookup, or anything where one collision in 2^32 is >> -acceptable. Do NOT use for cryptographic purposes. >> --------------------------------------------------------------------- >> -*/ >> - >> -hashval_t >> -iterative_hash (const void *k_in /* the key */, >> - register size_t length /* the length of the key */, >> - register hashval_t initval /* the previous hash, or >> - an arbitrary value */) >> -{ >> - register const unsigned char *k = (const unsigned char *)k_in; >> - register hashval_t a,b,c,len; >> - >> - /* Set up the internal state */ >> - len = length; >> - a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */ >> - c = initval; /* the previous hash value */ >> - >> - /*---------------------------------------- handle most of the key */ >> -#ifndef WORDS_BIGENDIAN >> - /* On a little-endian machine, if the data is 4-byte aligned we can hash >> - by word for better speed. This gives nondeterministic results on >> - big-endian machines. */ >> - if (sizeof (hashval_t) == 4 && (((size_t)k)&3) == 0) >> - while (len >= 12) /* aligned */ >> - { >> - a += *(hashval_t *)(k+0); >> - b += *(hashval_t *)(k+4); >> - c += *(hashval_t *)(k+8); >> - mix(a,b,c); >> - k += 12; len -= 12; >> - } >> - else /* unaligned */ >> -#endif >> - while (len >= 12) >> - { >> - a += (k[0] +((hashval_t)k[1]<<8) +((hashval_t)k[2]<<16) +((hashval_t)k[3]<<24)); >> - b += (k[4] +((hashval_t)k[5]<<8) +((hashval_t)k[6]<<16) +((hashval_t)k[7]<<24)); >> - c += (k[8] +((hashval_t)k[9]<<8) +((hashval_t)k[10]<<16)+((hashval_t)k[11]<<24)); >> - mix(a,b,c); >> - k += 12; len -= 12; >> - } >> - >> - /*------------------------------------- handle the last 11 bytes */ >> - c += length; >> - switch(len) /* all the case statements fall through */ >> - { >> - case 11: c+=((hashval_t)k[10]<<24); /* fall through */ >> - case 10: c+=((hashval_t)k[9]<<16); /* fall through */ >> - case 9 : c+=((hashval_t)k[8]<<8); /* fall through */ >> - /* the first byte of c is reserved for the length */ >> - case 8 : b+=((hashval_t)k[7]<<24); /* fall through */ >> - case 7 : b+=((hashval_t)k[6]<<16); /* fall through */ >> - case 6 : b+=((hashval_t)k[5]<<8); /* fall through */ >> - case 5 : b+=k[4]; /* fall through */ >> - case 4 : a+=((hashval_t)k[3]<<24); /* fall through */ >> - case 3 : a+=((hashval_t)k[2]<<16); /* fall through */ >> - case 2 : a+=((hashval_t)k[1]<<8); /* fall through */ >> - case 1 : a+=k[0]; >> - /* case 0: nothing left to add */ >> - } >> - mix(a,b,c); >> - /*-------------------------------------------- report the result */ >> - return c; >> -} >> diff --git a/hashtab.h b/hashtab.h >> index cb3da01..509e627 100644 >> --- a/hashtab.h >> +++ b/hashtab.h >> @@ -153,9 +153,11 @@ extern void htab_restore (htab_t, const char *, htab_restorefn); >> #endif >> -/* An iterative hash function for arbitrary data. */ >> -extern hashval_t iterative_hash (const void *, size_t, hashval_t); >> +#include >> + >> /* Shorthand for hashing something with an intrinsic size. */ >> +#define hash(IN,LEN) XXH64(IN, LEN, 0) >> +#define iterative_hash(IN,LEN,INIT) XXH64(IN, LEN, INIT) >> #define iterative_hash_object(OB,INIT) iterative_hash (&OB, sizeof (OB), INIT) >> #ifdef __cplusplus > > See above for the question why everything hash related isn't defined > in the same place. Done. > > How much of hashtab.{c,h} are we still using? wc -l hashtab.[ch] 628 hashtab.c 160 hashtab.h 788 total with my patch. Please take a look at the updated patch. Martin > > Thanks, > > Mark