From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <paulhollinsky@gmail.com>
Received: from mail-qk1-x72a.google.com (mail-qk1-x72a.google.com
 [IPv6:2607:f8b0:4864:20::72a])
 by sourceware.org (Postfix) with ESMTPS id EEEF53858D28
 for <gcc-patches@gcc.gnu.org>; Wed,  6 Jul 2022 23:03:50 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org EEEF53858D28
Received: by mail-qk1-x72a.google.com with SMTP id z12so12198026qki.3
 for <gcc-patches@gcc.gnu.org>; Wed, 06 Jul 2022 16:03:50 -0700 (PDT)
X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
 d=1e100.net; s=20210112;
 h=x-gm-message-state:from:to:cc:subject:date:message-id:mime-version
 :content-transfer-encoding;
 bh=kfTgV6XLo1gY8xaiicaiecVxMMG5luOcrA5SlBlRBwM=;
 b=PQN2P13sFQcgiat9AOLqlZMpwIvJ8kPNqvxgjxCHxMQCxe96xac50QgF10LnoJEfim
 Tm8TZ7PNcHDrR5njR6fed6JM+eQc0lskPpWJqOKntCsG+3FCrFUfwajfuZfCblV1xnhy
 Gdreb/8BvJc7tV6PthgU0qp5zLtcub52CCEuGq0IQi00OPqvvhcGdhGVAl1vqg45B6Jn
 LF2PmjF2Qpjzzfk+naw0RQ0eFUAE48RxVyizmcxKBrWAqmO4ZsceC2vgSjO8bbD+HEr3
 S9YMwuoDC1LZ7htjWHCFIjmJw5n3ouBbyODrAI4g9Vj2NvkGXxS9KEBsb6Jj7xO6C7/2
 TKSA==
X-Gm-Message-State: AJIora8h5IMQ1/UnavvOyiRc2TOEjU3Xw741GSEYXSbdbwp4wzVkAaLZ
 slBhsFgKm8Y+BWFWxTADfvZUUiyUSQTMeP3n
X-Google-Smtp-Source: AGRyM1s2eD9siQABXriQ5ju+/+oqRWHN6I+P+OJFcBlnzQueGTnisESlqNk4mVDRkUBUng/EbGOmEQ==
X-Received: by 2002:a37:9a88:0:b0:6b5:503a:c480 with SMTP id
 c130-20020a379a88000000b006b5503ac480mr114038qke.107.1657148629696; 
 Wed, 06 Jul 2022 16:03:49 -0700 (PDT)
Received: from localhost.localdomain ([91.90.120.161])
 by smtp.gmail.com with ESMTPSA id
 fb10-20020a05622a480a00b00304e2e4bf1esm25467420qtb.88.2022.07.06.16.03.46
 (version=TLS1_2 cipher=ECDHE-ECDSA-AES128-GCM-SHA256 bits=128/128);
 Wed, 06 Jul 2022 16:03:47 -0700 (PDT)
From: Paul Hollinsky <paulhollinsky@gmail.com>
To: gcc-patches@gcc.gnu.org
Cc: Paul Hollinsky <paulhollinsky@gmail.com>
Subject: [PATCH] libcpp: Optimize #pragma once with a hash table [PR58770]
Date: Wed,  6 Jul 2022 16:03:21 -0700
Message-Id: <20220706230321.48041-1-paulhollinsky@gmail.com>
X-Mailer: git-send-email 2.19.1
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-12.7 required=5.0 tests=BAYES_00, DKIM_SIGNED,
 DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0,
 RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP,
 T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on
 server2.sourceware.org
X-BeenThere: gcc-patches@gcc.gnu.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Gcc-patches mailing list <gcc-patches.gcc.gnu.org>
List-Unsubscribe: <https://gcc.gnu.org/mailman/options/gcc-patches>,
 <mailto:gcc-patches-request@gcc.gnu.org?subject=unsubscribe>
List-Archive: <https://gcc.gnu.org/pipermail/gcc-patches/>
List-Post: <mailto:gcc-patches@gcc.gnu.org>
List-Help: <mailto:gcc-patches-request@gcc.gnu.org?subject=help>
List-Subscribe: <https://gcc.gnu.org/mailman/listinfo/gcc-patches>,
 <mailto:gcc-patches-request@gcc.gnu.org?subject=subscribe>
X-List-Received-Date: Wed, 06 Jul 2022 23:03:55 -0000

Rather than traversing the all_files linked list for every include,
this factors out the quick idempotency checks (modification time
and size) to be the keys in a hash table so we can find matching
files quickly.

The hash table value type is a linked list, in case more than one
file matches the quick check.

The table is only built if a once-only file is seen, so include
guard performance is not affected.

My laptop would previously complete Ricardo's benchmark from the
PR in ~1.1s using #pragma once, and ~0.35s using include guards.

After this change, both benchmarks now complete in ~0.35s. I did
have to randomize the modification dates on the benchmark headers
so the files did not all end up in the same hash table list, but
that would likely not come up outside of the contrived benchmark.

libcpp/ChangeLog:

	PR preprocessor/58770
	* internal.h: Add hash table for #pragma once.
	* files.cc: Optimize #pragma once with the hash table.
---
 libcpp/files.cc   | 113 +++++++++++++++++++++++++++++++++++++++++++---
 libcpp/internal.h |   3 ++
 2 files changed, 110 insertions(+), 6 deletions(-)

diff --git a/libcpp/files.cc b/libcpp/files.cc
index 24208f7b0f8..51901aa31dd 100644
--- a/libcpp/files.cc
+++ b/libcpp/files.cc
@@ -167,6 +167,33 @@ struct file_hash_entry_pool
   struct cpp_file_hash_entry pool[FILE_HASH_POOL_SIZE];
 };
 
+/* A set of attributes (modification time and size) designed to quickly
+   identify obviously different files in a hashtable.  In case of collisions
+   we still maintain a list.  These sub-lists can then be checked for
+   #pragma once rather than iterating through all_files.  */
+struct file_quick_idempotency_attrs
+{
+  file_quick_idempotency_attrs(const _cpp_file *f)
+    : mtime(f->st.st_mtime), size(f->st.st_size) {}
+
+  time_t mtime;  /* Copied from f->st.st_mtime.  */
+  off_t size;  /* Copied from f->st.st_size.  */
+
+  static hashval_t hash (/* _cpp_file* */ const void *p);
+};
+
+/* Sub-list of very similar files kept in a hashtable to check for #pragma
+   once.  */
+struct file_sublist
+{
+  _cpp_file *f;  /* File recorded by this node.  */
+  file_sublist *next;  /* Following node; null at the end of the list.  */
+
+  static int eq (/* file_sublist* */ const void *p,
+		 /* _cpp_file* */ const void *q);
+  static void del (/* file_sublist* */ void *p);
+};
+
 static bool open_file (_cpp_file *file);
 static bool pch_open_file (cpp_reader *pfile, _cpp_file *file,
 			   bool *invalid_pch);
@@ -849,17 +876,18 @@ has_unique_contents (cpp_reader *pfile, _cpp_file *file, bool import,
   if (!pfile->seen_once_only)
     return true;
 
-  /* We may have read the file under a different name.  Look
-     for likely candidates and compare file contents to be sure.  */
-  for (_cpp_file *f = pfile->all_files; f; f = f->next_file)
+  /* We may have read the file under a different name.  We've kept
+     similar-looking files in the lists under this hash table, so
+     check those more thoroughly.  */
+  void* ent = htab_find(pfile->pragma_once_files, file);
+  for (file_sublist *e = reinterpret_cast<file_sublist*> (ent); e; e = e->next)
     {
+      _cpp_file *f = e->f;
       if (f == file)
 	continue; /* It'sa me!  */
 
       if ((import || f->once_only)
-	  && f->err_no == 0
-	  && f->st.st_mtime == file->st.st_mtime
-	  && f->st.st_size == file->st.st_size)
+	  && f->err_no == 0)
 	{
 	  _cpp_file *ref_file;
 
@@ -895,6 +923,36 @@ has_unique_contents (cpp_reader *pfile, _cpp_file *file, bool import,
   return true;
 }
 
+/* Record FILE in the #pragma once table so that a later include with
+   the same modification time and size can be matched against it.  */
+static void
+update_pragma_once_table (cpp_reader *pfile, _cpp_file *file)
+{
+  void **slot = htab_find_slot (pfile->pragma_once_files, file, INSERT);
+  if (!slot)
+    {
+      cpp_error (pfile, CPP_DL_ERROR,
+		 "Unable to create #pragma once table space for %s",
+		 _cpp_get_file_name(file));
+      return;
+    }
+
+  file_sublist *node
+    = reinterpret_cast<file_sublist*> (xcalloc(1, sizeof(file_sublist)));
+  node->f = file;
+  /* An empty slot means FILE starts a new sub-list.  */
+  if (!*slot)
+    *slot = node;
+  else
+    {
+      /* Otherwise append after the last node already in the sub-list.  */
+      file_sublist *last = reinterpret_cast<file_sublist*> (*slot);
+      while (last->next)
+	last = last->next;
+      last->next = node;
+    }
+}
+
 /* Place the file referenced by FILE into a new buffer on the buffer
    stack if possible.  Returns true if a buffer is stacked.  Use LOC
    for any diagnostics.  */
@@ -950,6 +1008,9 @@ _cpp_stack_file (cpp_reader *pfile, _cpp_file *file, include_type type,
       if (!has_unique_contents (pfile, file, type == IT_IMPORT, loc))
 	return false;
 
+      if (pfile->seen_once_only && file->once_only)
+	update_pragma_once_table(pfile, file);
+
       if (pfile->buffer && file->dir)
 	sysp = MAX (pfile->buffer->sysp, file->dir->sysp);
 
@@ -1434,6 +1495,41 @@ nonexistent_file_hash_eq (const void *p, const void *q)
   return filename_cmp ((const char *) p, (const char *) q) == 0;
 }
 
+/* Hash P (a _cpp_file) field by field so struct padding is never hashed.
+   NOTE(review): confirm htab_expand never rehashes file_sublist entries.  */
+hashval_t
+file_quick_idempotency_attrs::hash (const void *p)
+{
+  file_quick_idempotency_attrs kh (reinterpret_cast<const _cpp_file*> (p));
+  return iterative_hash_object (kh.size, iterative_hash_object (kh.mtime, 0));
+}
+
+/* Equality checker for the #pragma once hash table.  */
+int
+file_sublist::eq (const void *p, const void *q)
+{
+  /* P is a sub-list from the table, Q the file being looked up.  All files
+     in a sub-list share these attributes, so checking the head suffices.  */
+  const _cpp_file *head = reinterpret_cast<const file_sublist*> (p)->f;
+  const _cpp_file *file = reinterpret_cast<const _cpp_file*> (q);
+  if (file->st.st_size != head->st.st_size)
+    return 0;
+  return file->st.st_mtime == head->st.st_mtime;
+}
+
+/* Cleanup for a file sub-list: frees every node, including the head P
+   stored in the table.  Does not free the _cpp_file structures within.  */
+void
+file_sublist::del (void *p)
+{
+  file_sublist *next;
+  for (file_sublist *e = reinterpret_cast<file_sublist*> (p); e; e = next)
+    {
+      next = e->next;
+      free (e);
+    }
+}
+
 /* Initialize everything in this source file.  */
 void
 _cpp_init_files (cpp_reader *pfile)
@@ -1442,6 +1538,10 @@ _cpp_init_files (cpp_reader *pfile)
 					NULL, xcalloc, free);
   pfile->dir_hash = htab_create_alloc (127, file_hash_hash, file_hash_eq,
 					NULL, xcalloc, free);
+  pfile->pragma_once_files = htab_create_alloc (127,
+					file_quick_idempotency_attrs::hash,
+					file_sublist::eq, file_sublist::del,
+					xcalloc, free);
   allocate_file_hash_entries (pfile);
   pfile->nonexistent_file_hash = htab_create_alloc (127, htab_hash_string,
 						    nonexistent_file_hash_eq,
@@ -1456,6 +1556,7 @@ _cpp_cleanup_files (cpp_reader *pfile)
 {
   htab_delete (pfile->file_hash);
   htab_delete (pfile->dir_hash);
+  htab_delete (pfile->pragma_once_files);
   htab_delete (pfile->nonexistent_file_hash);
   obstack_free (&pfile->nonexistent_file_ob, 0);
   free_file_hash_entries (pfile);
diff --git a/libcpp/internal.h b/libcpp/internal.h
index badfd1b40da..9c3c46df335 100644
--- a/libcpp/internal.h
+++ b/libcpp/internal.h
@@ -485,6 +485,9 @@ struct cpp_reader
      been used.  */
   bool seen_once_only;
 
+  /* Optimization for #pragma once.  */
+  struct htab *pragma_once_files;
+
   /* Multiple include optimization.  */
   const cpp_hashnode *mi_cmacro;
   const cpp_hashnode *mi_ind_cmacro;
-- 
2.19.1