public inbox for gcc-bugs@sourceware.org
help / color / mirror / Atom feed
From: "jakub at gcc dot gnu.org" <gcc-bugzilla@gcc.gnu.org>
To: gcc-bugs@gcc.gnu.org
Subject: [Bug c++/100977] [C++23] Implement C++ Identifier Syntax using Unicode Standard Annex 31
Date: Wed, 04 Aug 2021 16:14:50 +0000	[thread overview]
Message-ID: <bug-100977-4-9xdw2hjG5w@http.gcc.gnu.org/bugzilla/> (raw)
In-Reply-To: <bug-100977-4@http.gcc.gnu.org/bugzilla/>

https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100977

--- Comment #3 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
Incrementally, here is a makeucnid.c patch to also emit CXX23 and NXX23 flags
(CXX23 for code points valid in a C++23 identifier and NXX23 for those valid in
a C++23 identifier but not as its first character); it doesn't contain the
changes needed to actually handle them on the libcpp side.

--- libcpp/makeucnid.c.jj       2021-08-04 17:35:35.995944075 +0200
+++ libcpp/makeucnid.c  2021-08-04 18:13:56.399062234 +0200
@@ -17,7 +17,7 @@ along with this program; see the file CO

 /* Run this program as
    ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \
-       > ucnid.h
+      DerivedCoreProperties.txt > ucnid.h
 */

 #include <stdio.h>
@@ -32,10 +32,12 @@ enum {
   N99 = 4,
   C11 = 8,
   N11 = 16,
-  all_languages = C99 | CXX | C11,
-  not_NFC = 32,
-  not_NFKC = 64,
-  maybe_not_NFC = 128
+  CXX23 = 32,
+  NXX23 = 64,
+  all_languages = C99 | CXX | C11 | CXX23 | NXX23,
+  not_NFC = 128,
+  not_NFKC = 256,
+  maybe_not_NFC = 512
 };

 #define NUM_CODE_POINTS 0x110000
@@ -241,6 +243,74 @@ read_derived (const char *fname)
   fclose (f);
 }

+/* Read DerivedCoreProperties.txt and fill in the C++23 flags (CXX23 and
+   NXX23) in flags from the XID_Start and XID_Continue properties.  */
+
+static void
+read_derivedcore (char *fname)
+{
+  FILE * f = fopen (fname, "r");
+  
+  if (!f)
+    fail ("opening DerivedCoreProperties.txt");
+  for (;;)
+    {
+      char line[256];
+      unsigned long codepoint_start, codepoint_end;
+      char *l;
+      int i, j;
+
+      if (!fgets (line, sizeof (line), f))
+       break;
+      if (line[0] == '#' || line[0] == '\n' || line[0] == '\r')
+       continue;
+      codepoint_start = strtoul (line, &l, 16);
+      if (l == line)
+       fail ("parsing DerivedCoreProperties.txt, reading code point");
+      if (codepoint_start > MAX_CODE_POINT)
+       fail ("parsing DerivedCoreProperties.txt, code point too large");
+      
+      if (*l == '.' && l[1] == '.')
+       {
+         char *l2 = l + 2;
+         codepoint_end = strtoul (l + 2, &l, 16);
+         if (l == l2 || codepoint_end < codepoint_start)
+           fail ("parsing DerivedCoreProperties.txt, reading code point");
+         if (codepoint_end > MAX_CODE_POINT)
+           fail ("parsing DerivedCoreProperties.txt, code point too large");
+       }
+      else
+       codepoint_end = codepoint_start;
+
+      while (*l == ' ')
+       l++;
+      if (*l++ != ';')
+       fail ("parsing DerivedCoreProperties.txt, reading code point");
+
+      while (*l == ' ')
+       l++;
+
+      if (codepoint_end < 0x80)
+        continue;
+
+      if (strncmp (l, "XID_Start ", 10) == 0)
+       {
+         for (; codepoint_start <= codepoint_end; codepoint_start++)
+           flags[codepoint_start]
+             = (flags[codepoint_start] | CXX23) & ~NXX23;
+       }
+      else if (strncmp (l, "XID_Continue ", 13) == 0)
+       {
+         for (; codepoint_start <= codepoint_end; codepoint_start++)
+           if ((flags[codepoint_start] & CXX23) == 0)
+             flags[codepoint_start] |= CXX23 | NXX23;
+       }
+    }
+  if (ferror (f))
+    fail ("reading DerivedCoreProperties.txt");
+  fclose (f);
+}
+
 /* Write out the table.
    The table consists of two words per entry.  The first word is the flags
    for the unicode code points up to and including the second word.  */
@@ -261,12 +331,14 @@ write_table (void)
        || really_safe != (decomp[i][0] == 0)
        || combining_value[i] != last_combine)
       {
-       printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
+       printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
                last_flag & C99 ? "C99" : "  0",
                last_flag & N99 ? "N99" : "  0",
                last_flag & CXX ? "CXX" : "  0",
                last_flag & C11 ? "C11" : "  0",
                last_flag & N11 ? "N11" : "  0",
+               last_flag & CXX23 ? "CXX23" : "    0",
+               last_flag & NXX23 ? "NXX23" : "    0",
                really_safe ? "CID" : "  0",
                last_flag & not_NFC ? "  0" : "NFC",
                last_flag & not_NFKC ? "  0" : "NKC",
@@ -439,11 +511,12 @@ write_copyright (void)
 int
 main(int argc, char ** argv)
 {
-  if (argc != 4)
+  if (argc != 5)
     fail ("too few arguments to makeucn");
   read_ucnid (argv[1]);
   read_table (argv[2]);
   read_derived (argv[3]);
+  read_derivedcore (argv[4]);

   write_copyright ();
   write_table ();

  parent reply	other threads:[~2021-08-04 16:14 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-06-08 18:15 [Bug c++/100977] New: " jason at gcc dot gnu.org
2021-06-08 18:19 ` [Bug c++/100977] " mpolacek at gcc dot gnu.org
2021-08-04 13:39 ` jakub at gcc dot gnu.org
2021-08-04 14:08 ` jakub at gcc dot gnu.org
2021-08-04 16:14 ` jakub at gcc dot gnu.org [this message]
2021-08-04 18:34 ` joseph at codesourcery dot com
2021-08-04 18:40 ` jakub at gcc dot gnu.org
2021-08-04 19:06 ` ubizjak at gmail dot com
2021-08-04 19:20 ` jakub at gcc dot gnu.org
2021-08-04 19:25 ` ubizjak at gmail dot com
2021-08-05 10:17 ` jakub at gcc dot gnu.org
2021-08-05 15:34 ` cvs-commit at gcc dot gnu.org
2021-08-05 15:35 ` cvs-commit at gcc dot gnu.org
2021-09-01 20:37 ` cvs-commit at gcc dot gnu.org
2021-09-01 20:38 ` jakub at gcc dot gnu.org
2021-11-30  8:51 ` cvs-commit at gcc dot gnu.org
2021-12-01  9:22 ` cvs-commit at gcc dot gnu.org

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=bug-100977-4-9xdw2hjG5w@http.gcc.gnu.org/bugzilla/ \
    --to=gcc-bugzilla@gcc.gnu.org \
    --cc=gcc-bugs@gcc.gnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).