From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <tromey@adacore.com>
Received: from mail-qt1-x836.google.com (mail-qt1-x836.google.com
 [IPv6:2607:f8b0:4864:20::836])
 by sourceware.org (Postfix) with ESMTPS id 844C83858422
 for <gdb-patches@sourceware.org>; Thu, 22 Sep 2022 20:21:06 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 844C83858422
Received: by mail-qt1-x836.google.com with SMTP id j10so7157980qtv.4
 for <gdb-patches@sourceware.org>; Thu, 22 Sep 2022 13:21:06 -0700 (PDT)
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
 d=1e100.net; s=20210112;
 h=content-transfer-encoding:mime-version:references:in-reply-to
 :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc
 :subject:date;
 bh=lD58QyqvFz1klLtZzOrocmSlRmewKaWlbHL1q2GLK+o=;
 b=PKSxx8WZGZhTAPBtA2hJDGRmyEBw03GAac2E06HN/75CsgR7C1SWSEEL8oUWi9T4iW
 +DMqX4oa0ewPb0DRMYjn/CuO7MJCjAAp6Fs6mKozHEy5BQwtDUHKYGCLVvhKic9GPUHu
 yUCA4YDM8heBslGlsyrMl+WymeIMbswXv96jGSzrVs1BfpK1Ua5AYMu8kqcvbuXE4hka
 dNFAxz0DDN3Pu7KC0cZz5GjiTd8KkIyozDOwE7HmEcsePenNayakR4gJ1Yv1rgUUvve6
 uyFjG0wPW8PYdBu1YI2YuIKRzCRCC26Acpw8dc9nYgKCwAhblOV7DMj9kkhAqBbY7Nr4
 yf+Q==
X-Gm-Message-State: ACrzQf3QI8gUGWGYeea6j2phuM8u+o8odrtyTL43aTNmwHriKbATL21a
 rftYCql0fgZUXjdJNlB9XhYv7emQiPxK2Q==
X-Google-Smtp-Source: AMsMyM4m98MZYLrrdRysSBEH0k0bRZlsBnmMz5ZUaZkn319//xzMRy8MpkFvw9Y0fWo75Mo08gMI8A==
X-Received: by 2002:ac8:5f82:0:b0:35b:b35e:74ff with SMTP id
 j2-20020ac85f82000000b0035bb35e74ffmr4326934qta.574.1663878066070; 
 Thu, 22 Sep 2022 13:21:06 -0700 (PDT)
Received: from localhost.localdomain (71-211-160-49.hlrn.qwest.net.
 [71.211.160.49]) by smtp.gmail.com with ESMTPSA id
 cp4-20020a05622a420400b0035cdd7a42d0sm3869093qtb.22.2022.09.22.13.21.05
 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256);
 Thu, 22 Sep 2022 13:21:05 -0700 (PDT)
From: Tom Tromey <tromey@adacore.com>
To: gdb-patches@sourceware.org
Cc: Tom Tromey <tromey@adacore.com>
Subject: [PATCH 2/2] Change .gdb_index de-duplication implementation
Date: Thu, 22 Sep 2022 14:20:54 -0600
Message-Id: <20220922202054.2773698-3-tromey@adacore.com>
X-Mailer: git-send-email 2.34.3
In-Reply-To: <20220922202054.2773698-1-tromey@adacore.com>
References: <20220922202054.2773698-1-tromey@adacore.com>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-10.2 required=5.0 tests=BAYES_00, DKIM_SIGNED,
 DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, GIT_PATCH_0, KAM_STOCKGEN,
 RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS,
 TXREP autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on
 server2.sourceware.org
X-BeenThere: gdb-patches@sourceware.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gdb-patches mailing list <gdb-patches.sourceware.org>
List-Unsubscribe: <https://sourceware.org/mailman/options/gdb-patches>,
 <mailto:gdb-patches-request@sourceware.org?subject=unsubscribe>
List-Archive: <https://sourceware.org/pipermail/gdb-patches/>
List-Post: <mailto:gdb-patches@sourceware.org>
List-Help: <mailto:gdb-patches-request@sourceware.org?subject=help>
List-Subscribe: <https://sourceware.org/mailman/listinfo/gdb-patches>,
 <mailto:gdb-patches-request@sourceware.org?subject=subscribe>
X-List-Received-Date: Thu, 22 Sep 2022 20:21:12 -0000

While investigating PR symtab/29179, I found that one Ada test failed
because, although a certain symbol was present in the index, the entry
written with the new DWARF reader pointed to a different CU than the
one chosen by earlier versions of gdb.

This patch changes how symbol de-duplication is done, deferring the
process until the entire symbol table has been constructed.  This way,
it's possible to always choose the lowest-numbered CU among duplicates,
which is how gdb (implicitly) worked previously.

Bug: https://sourceware.org/bugzilla/show_bug.cgi?id=29179
---
 gdb/dwarf2/index-write.c | 77 +++++++++++++++++++++-------------------
 1 file changed, 41 insertions(+), 36 deletions(-)

diff --git a/gdb/dwarf2/index-write.c b/gdb/dwarf2/index-write.c
index 6940a0ce3be..ae05946e790 100644
--- a/gdb/dwarf2/index-write.c
+++ b/gdb/dwarf2/index-write.c
@@ -176,6 +176,10 @@ struct symtab_index_entry
   /* A sorted vector of the indices of all the CUs that hold an object
      of this name.  */
   std::vector<offset_type> cu_indices;
+
+  /* Minimize CU_INDICES, sorting them and removing duplicates as
+     appropriate.  */
+  void minimize ();
 };
 
 /* The symbol table.  This is a power-of-2-sized hash table.  */
@@ -186,6 +190,13 @@ struct mapped_symtab
     data.resize (1024);
   }
 
+  /* Minimize each entry in the symbol table, removing duplicates.  */
+  void minimize ()
+  {
+    for (symtab_index_entry &item : data)
+      item.minimize ();
+  }
+
   offset_type n_elements = 0;
   std::vector<symtab_index_entry> data;
 
@@ -271,21 +282,36 @@ add_index_entry (struct mapped_symtab *symtab, const char *name,
   slot.cu_indices.push_back (cu_index_and_attrs);
 }
 
-/* Sort and remove duplicates of all symbols' cu_indices lists.  */
+/* See symtab_index_entry.  */
 
-static void
-uniquify_cu_indices (struct mapped_symtab *symtab)
+void
+symtab_index_entry::minimize ()
 {
-  for (auto &entry : symtab->data)
+  if (name == nullptr || cu_indices.empty ())
+    return;
+
+  std::sort (cu_indices.begin (), cu_indices.end ());
+  auto from = std::unique (cu_indices.begin (), cu_indices.end ());
+  cu_indices.erase (from, cu_indices.end ());
+
+  /* We don't want to enter a variable or type more than once, so
+     remove any such duplicates from the list as well.  When doing
+     this, we want to keep the entry from the first CU -- but this is
+     implicit due to the sort.  This choice is done because it's
+     similar to what gdb historically did for partial symbols.  */
+  std::unordered_set<offset_type> seen;
+  from = std::remove_if (cu_indices.begin (), cu_indices.end (),
+			 [&] (offset_type val)
     {
-      if (entry.name != NULL && !entry.cu_indices.empty ())
-	{
-	  auto &cu_indices = entry.cu_indices;
-	  std::sort (cu_indices.begin (), cu_indices.end ());
-	  auto from = std::unique (cu_indices.begin (), cu_indices.end ());
-	  cu_indices.erase (from, cu_indices.end ());
-	}
-    }
+      gdb_index_symbol_kind kind = GDB_INDEX_SYMBOL_KIND_VALUE (val);
+      if (kind != GDB_INDEX_SYMBOL_KIND_TYPE
+	  && kind != GDB_INDEX_SYMBOL_KIND_VARIABLE)
+	return false;
+
+      val &= ~GDB_INDEX_CU_MASK;
+      return !seen.insert (val).second;
+    });
+  cu_indices.erase (from, cu_indices.end ());
 }
 
 /* A form of 'const char *' suitable for container keys.  Only the
@@ -1103,15 +1129,6 @@ write_cooked_index (cooked_index_vector *table,
 		    const cu_index_map &cu_index_htab,
 		    struct mapped_symtab *symtab)
 {
-  /* We track type names and only enter a given type once.  */
-  htab_up type_names (htab_create_alloc (10, htab_hash_string, htab_eq_string,
-					 nullptr, xcalloc, xfree));
-  /* Same with variable names.  However, if a type and variable share
-     a name, we want both, which is why there are two hash tables
-     here.  */
-  htab_up var_names (htab_create_alloc (10, htab_hash_string, htab_eq_string,
-					nullptr, xcalloc, xfree));
-
   const char *main_for_ada = main_name ();
 
   for (const cooked_index_entry *entry : table->all_entries ())
@@ -1159,24 +1176,12 @@ write_cooked_index (cooked_index_vector *table,
       else if (entry->tag == DW_TAG_variable
 	       || entry->tag == DW_TAG_constant
 	       || entry->tag == DW_TAG_enumerator)
-	{
-	  kind = GDB_INDEX_SYMBOL_KIND_VARIABLE;
-	  void **slot = htab_find_slot (var_names.get (), name, INSERT);
-	  if (*slot != nullptr)
-	    continue;
-	  *slot = (void *) name;
-	}
+	kind = GDB_INDEX_SYMBOL_KIND_VARIABLE;
       else if (entry->tag == DW_TAG_module
 	       || entry->tag == DW_TAG_common_block)
 	kind = GDB_INDEX_SYMBOL_KIND_OTHER;
       else
-	{
-	  kind = GDB_INDEX_SYMBOL_KIND_TYPE;
-	  void **slot = htab_find_slot (type_names.get (), name, INSERT);
-	  if (*slot != nullptr)
-	    continue;
-	  *slot = (void *) name;
-	}
+	kind = GDB_INDEX_SYMBOL_KIND_TYPE;
 
       add_index_entry (symtab, name, (entry->flags & IS_STATIC) != 0,
 		       kind, it->second);
@@ -1254,7 +1259,7 @@ write_gdbindex (dwarf2_per_objfile *per_objfile,
 
   /* Now that we've processed all symbols we can shrink their cu_indices
      lists.  */
-  uniquify_cu_indices (&symtab);
+  symtab.minimize ();
 
   data_buf symtab_vec, constant_pool;
   if (symtab.n_elements == 0)
-- 
2.34.3