From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 48) id D9D0F3850438; Wed, 4 Aug 2021 16:14:50 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org D9D0F3850438 From: "jakub at gcc dot gnu.org" To: gcc-bugs@gcc.gnu.org Subject: [Bug c++/100977] [C++23] Implement C++ Identifier Syntax using Unicode Standard Annex 31 Date: Wed, 04 Aug 2021 16:14:50 +0000 X-Bugzilla-Reason: CC X-Bugzilla-Type: changed X-Bugzilla-Watch-Reason: None X-Bugzilla-Product: gcc X-Bugzilla-Component: c++ X-Bugzilla-Version: 12.0 X-Bugzilla-Keywords: X-Bugzilla-Severity: normal X-Bugzilla-Who: jakub at gcc dot gnu.org X-Bugzilla-Status: NEW X-Bugzilla-Resolution: X-Bugzilla-Priority: P3 X-Bugzilla-Assigned-To: unassigned at gcc dot gnu.org X-Bugzilla-Target-Milestone: --- X-Bugzilla-Flags: X-Bugzilla-Changed-Fields: Message-ID: In-Reply-To: References: Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable X-Bugzilla-URL: http://gcc.gnu.org/bugzilla/ Auto-Submitted: auto-generated MIME-Version: 1.0 X-BeenThere: gcc-bugs@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Gcc-bugs mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 04 Aug 2021 16:14:51 -0000 https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100977 --- Comment #3 from Jakub Jelinek --- Incrementally, here is a makeucnid.c patch to also emit CXX23 and NXX23 flags (CXX23 for characters valid in a C++23 identifier, and NXX23 for characters valid in a C++23 identifier but not as its first character); it doesn't contain the changes to actually handle them on the libcpp side. 
--- libcpp/makeucnid.c.jj 2021-08-04 17:35:35.995944075 +0200 +++ libcpp/makeucnid.c 2021-08-04 18:13:56.399062234 +0200 @@ -17,7 +17,7 @@ along with this program; see the file CO /* Run this program as ./makeucnid ucnid.tab UnicodeData.txt DerivedNormalizationProps.txt \ - > ucnid.h + DerivedCoreProperties.txt > ucnid.h */ #include @@ -32,10 +32,12 @@ enum { N99 =3D 4, C11 =3D 8, N11 =3D 16, - all_languages =3D C99 | CXX | C11, - not_NFC =3D 32, - not_NFKC =3D 64, - maybe_not_NFC =3D 128 + CXX23 =3D 32, + NXX23 =3D 64, + all_languages =3D C99 | CXX | C11 | CXX23 | NXX23, + not_NFC =3D 128, + not_NFKC =3D 256, + maybe_not_NFC =3D 512 }; #define NUM_CODE_POINTS 0x110000 @@ -241,6 +243,74 @@ read_derived (const char *fname) fclose (f); } +/* Read DerivedCoreProperties.txt and fill in languages version in + flags from the XID_Start and XID_Continue properties. */ + +static void +read_derivedcore (char *fname) +{ + FILE * f =3D fopen (fname, "r"); +=20=20 + if (!f) + fail ("opening DerivedCoreProperties.txt"); + for (;;) + { + char line[256]; + unsigned long codepoint_start, codepoint_end; + char *l; + int i, j; + + if (!fgets (line, sizeof (line), f)) + break; + if (line[0] =3D=3D '#' || line[0] =3D=3D '\n' || line[0] =3D=3D '\r') + continue; + codepoint_start =3D strtoul (line, &l, 16); + if (l =3D=3D line) + fail ("parsing DerivedCoreProperties.txt, reading code point"); + if (codepoint_start > MAX_CODE_POINT) + fail ("parsing DerivedCoreProperties.txt, code point too large"); +=20=20=20=20=20=20 + if (*l =3D=3D '.' 
&& l[1] =3D=3D '.') + { + char *l2 =3D l + 2; + codepoint_end =3D strtoul (l + 2, &l, 16); + if (l =3D=3D l2 || codepoint_end < codepoint_start) + fail ("parsing DerivedCoreProperties.txt, reading code point"); + if (codepoint_end > MAX_CODE_POINT) + fail ("parsing DerivedCoreProperties.txt, code point too large"= ); + } + else + codepoint_end =3D codepoint_start; + + while (*l =3D=3D ' ') + l++; + if (*l++ !=3D ';') + fail ("parsing DerivedCoreProperties.txt, reading code point"); + + while (*l =3D=3D ' ') + l++; + + if (codepoint_end < 0x80) + continue; + + if (strncmp (l, "XID_Start ", 10) =3D=3D 0) + { + for (; codepoint_start <=3D codepoint_end; codepoint_start++) + flags[codepoint_start] + =3D (flags[codepoint_start] | CXX23) & ~NXX23; + } + else if (strncmp (l, "XID_Continue ", 13) =3D=3D 0) + { + for (; codepoint_start <=3D codepoint_end; codepoint_start++) + if ((flags[codepoint_start] & CXX23) =3D=3D 0) + flags[codepoint_start] |=3D CXX23 | NXX23; + } + } + if (ferror (f)) + fail ("reading DerivedCoreProperties.txt"); + fclose (f); +} + /* Write out the table. The table consists of two words per entry. The first word is the flags for the unicode code points up to and including the second word. */ @@ -261,12 +331,14 @@ write_table (void) || really_safe !=3D (decomp[i][0] =3D=3D 0) || combining_value[i] !=3D last_combine) { - printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n", + printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n", last_flag & C99 ? "C99" : " 0", last_flag & N99 ? "N99" : " 0", last_flag & CXX ? "CXX" : " 0", last_flag & C11 ? "C11" : " 0", last_flag & N11 ? "N11" : " 0", + last_flag & CXX23 ? "CXX23" : " 0", + last_flag & NXX23 ? "NXX23" : " 0", really_safe ? "CID" : " 0", last_flag & not_NFC ? " 0" : "NFC", last_flag & not_NFKC ? 
" 0" : "NKC", @@ -439,11 +511,12 @@ write_copyright (void) int main(int argc, char ** argv) { - if (argc !=3D 4) + if (argc !=3D 5) fail ("too few arguments to makeucn"); read_ucnid (argv[1]); read_table (argv[2]); read_derived (argv[3]); + read_derivedcore (argv[4]); write_copyright (); write_table ();=