public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [committed] preprocessor: C2x identifier rules
@ 2022-10-14 23:10 Joseph Myers
  0 siblings, 0 replies; only message in thread
From: Joseph Myers @ 2022-10-14 23:10 UTC (permalink / raw)
  To: gcc-patches

[-- Attachment #1: Type: text/plain, Size: 14736 bytes --]

C2x has, like C++, adopted rules for identifiers based directly on an
unversioned normative reference to Unicode.  Make libcpp follow those
rules for the c2x / gnu2x standards.  This involves bringing back a
flag, separate from the C++ one, for whether to use these identifier
rules; this time the flag is enabled for all C++ language versions,
since that was the conclusion adopted for C++ identifier handling.

There is one change here that affects C++.  I believe the new
normative requirement for NFC applies only to identifiers, not to the
use of identifier-continue characters in pp-numbers.  Since there is
no such requirement for pp-numbers, the non-NFC diagnostic there ought
to be a warning rather than a pedwarn, and I believe this is the case
for both C and C++.

Bootstrapped with no regressions for x86_64-pc-linux-gnu.

libcpp/
	* charset.cc (ucn_valid_in_identifier): Check xid_identifiers not
	cplusplus to determine whether to use CXX23 and NXX23 flags.
	* include/cpplib.h (struct cpp_options): Add xid_identifiers.
	* init.cc (struct lang_flags, lang_defaults): Add xid_identifiers.
	(cpp_set_lang): Set xid_identifiers.
	* lex.cc (warn_about_normalization): Add parameter identifier.
	Only pedwarn about non-NFC for identifiers, not pp-numbers.
	(_cpp_lex_direct): Update calls to warn_about_normalization.

gcc/testsuite/
	* gcc.dg/cpp/c2x-ucnid-1-utf8.c, gcc.dg/cpp/c2x-ucnid-1.c: New
	tests.

---

It would incidentally now be appropriate to update the Unicode data in
libcpp from Unicode 14 to Unicode 15.

diff --git a/gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1-utf8.c b/gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1-utf8.c
new file mode 100644
index 00000000000..55d22819563
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1-utf8.c
@@ -0,0 +1,13 @@
+/* Test C2x (= Unicode) rules for characters in identifiers.  */
+/* { dg-do preprocess } */
+/* { dg-options "-std=c2x -pedantic-errors" } */
+
+¨
+
+/* The requirement for NFC only applies in identifiers, not pp-numbers.  */
+
+À /* { dg-error "not in NFC" } */
+ÿÀ /* { dg-error "not in NFC" } */
+
+0À /* { dg-warning "not in NFC" } */
+.1À /* { dg-warning "not in NFC" } */
diff --git a/gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1.c b/gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1.c
new file mode 100644
index 00000000000..f9fdbea6ece
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/cpp/c2x-ucnid-1.c
@@ -0,0 +1,13 @@
+/* Test C2x (= Unicode) rules for characters in identifiers.  */
+/* { dg-do preprocess } */
+/* { dg-options "-std=c2x -pedantic-errors" } */
+
+\u00A8 /* { dg-error "is not valid in an identifier" } */
+
+/* The requirement for NFC only applies in identifiers, not pp-numbers.  */
+
+A\u0300 /* { dg-error "not in NFC" } */
+\u00ffA\u0300 /* { dg-error "not in NFC" } */
+
+0A\u0300 /* { dg-warning "not in NFC" } */
+.1A\u0300 /* { dg-warning "not in NFC" } */
diff --git a/libcpp/charset.cc b/libcpp/charset.cc
index 6834969a919..12a398e7527 100644
--- a/libcpp/charset.cc
+++ b/libcpp/charset.cc
@@ -1291,7 +1291,7 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
   valid_flags = C99 | CXX | C11 | CXX23;
   if (CPP_PEDANTIC (pfile))
     {
-      if (CPP_OPTION (pfile, cplusplus))
+      if (CPP_OPTION (pfile, xid_identifiers))
 	valid_flags = CXX23;
       else if (CPP_OPTION (pfile, c11_identifiers))
 	valid_flags = C11;
@@ -1355,7 +1355,7 @@ ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
       return 2;
     }
 
-  if (CPP_OPTION (pfile, cplusplus))
+  if (CPP_OPTION (pfile, xid_identifiers))
     invalid_start_flags = NXX23;
   else if (CPP_OPTION (pfile, c11_identifiers))
     invalid_start_flags = N11;
diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
index e97993e04bc..d5ef12a30ea 100644
--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@@ -496,6 +496,10 @@ struct cpp_options
      in C11.  */
   unsigned char c11_identifiers;
 
+  /* Nonzero means extended identifiers allow the characters specified
+     by Unicode XID_Start and XID_Continue properties.  */
+  unsigned char xid_identifiers;
+
   /* Nonzero for C++ 2014 Standard binary constants.  */
   unsigned char binary_constants;
 
diff --git a/libcpp/init.cc b/libcpp/init.cc
index d3b4f00994b..5f34e3515d2 100644
--- a/libcpp/init.cc
+++ b/libcpp/init.cc
@@ -82,6 +82,7 @@ struct lang_flags
   char extended_numbers;
   char extended_identifiers;
   char c11_identifiers;
+  char xid_identifiers;
   char std;
   char digraphs;
   char uliterals;
@@ -102,31 +103,31 @@ struct lang_flags
 };
 
 static const struct lang_flags lang_defaults[] =
-{ /*              c99 c++ xnum xid c11 std digr ulit rlit udlit bincst digsep trig u8chlit vaopt scope dfp szlit elifdef warndir delim trufal */
-  /* GNUC89   */  { 0,  0,  1,  0,  0,  0,  1,   0,   0,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0,      0,    0 },
-  /* GNUC99   */  { 1,  0,  1,  1,  0,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0,      0,    0 },
-  /* GNUC11   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0,      0,    0 },
-  /* GNUC17   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0,      0,    0 },
-  /* GNUC2X   */  { 1,  0,  1,  1,  1,  0,  1,   1,   1,   0,    1,     1,     0,   1,      1,   1,     1,   0,   1,      1,      0,    1 },
-  /* STDC89   */  { 0,  0,  0,  0,  0,  1,  0,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0,      0,    0 },
-  /* STDC94   */  { 0,  0,  0,  0,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0,      0,    0 },
-  /* STDC99   */  { 1,  0,  1,  1,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0,      0,    0 },
-  /* STDC11   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0,      0,    0 },
-  /* STDC17   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0,      0,    0 },
-  /* STDC2X   */  { 1,  0,  1,  1,  1,  1,  1,   1,   0,   0,    1,     1,     0,   1,      0,   1,     1,   0,   1,      1,      0,    1 },
-  /* GNUCXX   */  { 0,  1,  1,  1,  0,  0,  1,   0,   0,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0,      0,    1 },
-  /* CXX98    */  { 0,  1,  0,  1,  0,  1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   1,     0,   0,   0,      0,      0,    1 },
-  /* GNUCXX11 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0,      0,    1 },
-  /* CXX11    */  { 1,  1,  0,  1,  1,  1,  1,   1,   1,   1,    0,     0,     1,   0,      0,   1,     0,   0,   0,      0,      0,    1 },
-  /* GNUCXX14 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   0,      1,   1,     0,   0,   0,      0,      0,    1 },
-  /* CXX14    */  { 1,  1,  0,  1,  1,  1,  1,   1,   1,   1,    1,     1,     1,   0,      0,   1,     0,   0,   0,      0,      0,    1 },
-  /* GNUCXX17 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0,      0,      0,    1 },
-  /* CXX17    */  { 1,  1,  1,  1,  1,  1,  1,   1,   1,   1,    1,     1,     0,   1,      0,   1,     0,   0,   0,      0,      0,    1 },
-  /* GNUCXX20 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0,      0,      0,    1 },
-  /* CXX20    */  { 1,  1,  1,  1,  1,  1,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0,      0,      0,    1 },
-  /* GNUCXX23 */  { 1,  1,  1,  1,  1,  0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   1,   1,      1,      1,    1 },
-  /* CXX23    */  { 1,  1,  1,  1,  1,  1,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   1,   1,      1,      1,    1 },
-  /* ASM      */  { 0,  0,  1,  0,  0,  0,  0,   0,   0,   0,    0,     0,     0,   0,      0,   0,     0,   0,   0,      0,      0,    0 }
+{ /*              c99 c++ xnum xid c11 xidid std digr ulit rlit udlit bincst digsep trig u8chlit vaopt scope dfp szlit elifdef warndir delim trufal */
+  /* GNUC89   */  { 0,  0,  1,  0,  0,  0,    0,  1,   0,   0,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0,      0,    0 },
+  /* GNUC99   */  { 1,  0,  1,  1,  0,  0,    0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0,      0,    0 },
+  /* GNUC11   */  { 1,  0,  1,  1,  1,  0,    0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0,      0,    0 },
+  /* GNUC17   */  { 1,  0,  1,  1,  1,  0,    0,  1,   1,   1,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0,      0,    0 },
+  /* GNUC2X   */  { 1,  0,  1,  1,  1,  1,    0,  1,   1,   1,   0,    1,     1,     0,   1,      1,   1,     1,   0,   1,      1,      0,    1 },
+  /* STDC89   */  { 0,  0,  0,  0,  0,  0,    1,  0,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0,      0,    0 },
+  /* STDC94   */  { 0,  0,  0,  0,  0,  0,    1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0,      0,    0 },
+  /* STDC99   */  { 1,  0,  1,  1,  0,  0,    1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0,      0,    0 },
+  /* STDC11   */  { 1,  0,  1,  1,  1,  0,    1,  1,   1,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0,      0,    0 },
+  /* STDC17   */  { 1,  0,  1,  1,  1,  0,    1,  1,   1,   0,   0,    0,     0,     1,   0,      0,   0,     0,   0,   0,      0,      0,    0 },
+  /* STDC2X   */  { 1,  0,  1,  1,  1,  1,    1,  1,   1,   0,   0,    1,     1,     0,   1,      0,   1,     1,   0,   1,      1,      0,    1 },
+  /* GNUCXX   */  { 0,  1,  1,  1,  0,  1,    0,  1,   0,   0,   0,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0,      0,    1 },
+  /* CXX98    */  { 0,  1,  0,  1,  0,  1,    1,  1,   0,   0,   0,    0,     0,     1,   0,      0,   1,     0,   0,   0,      0,      0,    1 },
+  /* GNUCXX11 */  { 1,  1,  1,  1,  1,  1,    0,  1,   1,   1,   1,    0,     0,     0,   0,      1,   1,     0,   0,   0,      0,      0,    1 },
+  /* CXX11    */  { 1,  1,  0,  1,  1,  1,    1,  1,   1,   1,   1,    0,     0,     1,   0,      0,   1,     0,   0,   0,      0,      0,    1 },
+  /* GNUCXX14 */  { 1,  1,  1,  1,  1,  1,    0,  1,   1,   1,   1,    1,     1,     0,   0,      1,   1,     0,   0,   0,      0,      0,    1 },
+  /* CXX14    */  { 1,  1,  0,  1,  1,  1,    1,  1,   1,   1,   1,    1,     1,     1,   0,      0,   1,     0,   0,   0,      0,      0,    1 },
+  /* GNUCXX17 */  { 1,  1,  1,  1,  1,  1,    0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0,      0,      0,    1 },
+  /* CXX17    */  { 1,  1,  1,  1,  1,  1,    1,  1,   1,   1,   1,    1,     1,     0,   1,      0,   1,     0,   0,   0,      0,      0,    1 },
+  /* GNUCXX20 */  { 1,  1,  1,  1,  1,  1,    0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0,      0,      0,    1 },
+  /* CXX20    */  { 1,  1,  1,  1,  1,  1,    1,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   0,   0,      0,      0,    1 },
+  /* GNUCXX23 */  { 1,  1,  1,  1,  1,  1,    0,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   1,   1,      1,      1,    1 },
+  /* CXX23    */  { 1,  1,  1,  1,  1,  1,    1,  1,   1,   1,   1,    1,     1,     0,   1,      1,   1,     0,   1,   1,      1,      1,    1 },
+  /* ASM      */  { 0,  0,  1,  0,  0,  0,    0,  0,   0,   0,   0,    0,     0,     0,   0,      0,   0,     0,   0,   0,      0,      0,    0 }
 };
 
 /* Sets internal flags correctly for a given language.  */
@@ -142,6 +143,7 @@ cpp_set_lang (cpp_reader *pfile, enum c_lang lang)
   CPP_OPTION (pfile, extended_numbers)		 = l->extended_numbers;
   CPP_OPTION (pfile, extended_identifiers)	 = l->extended_identifiers;
   CPP_OPTION (pfile, c11_identifiers)		 = l->c11_identifiers;
+  CPP_OPTION (pfile, xid_identifiers)		 = l->xid_identifiers;
   CPP_OPTION (pfile, std)			 = l->std;
   CPP_OPTION (pfile, digraphs)			 = l->digraphs;
   CPP_OPTION (pfile, uliterals)			 = l->uliterals;
diff --git a/libcpp/lex.cc b/libcpp/lex.cc
index a429a3d44ce..cc12a52d282 100644
--- a/libcpp/lex.cc
+++ b/libcpp/lex.cc
@@ -2007,7 +2007,8 @@ name_p (cpp_reader *pfile, const cpp_string *string)
 static void
 warn_about_normalization (cpp_reader *pfile, 
 			  const cpp_token *token,
-			  const struct normalize_state *s)
+			  const struct normalize_state *s,
+			  bool identifier)
 {
   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
       && !pfile->state.skipping)
@@ -2043,7 +2044,7 @@ warn_about_normalization (cpp_reader *pfile,
       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
 	cpp_warning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
 			"`%.*s' is not in NFKC", (int) sz, buf);
-      else if (CPP_OPTION (pfile, cplusplus))
+      else if (identifier && CPP_OPTION (pfile, xid_identifiers))
 	cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, &rich_loc,
 				  "`%.*s' is not in NFC", (int) sz, buf);
       else
@@ -3839,7 +3840,7 @@ _cpp_lex_direct (cpp_reader *pfile)
 	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
 	result->type = CPP_NUMBER;
 	lex_number (pfile, &result->val.str, &nst);
-	warn_about_normalization (pfile, result, &nst);
+	warn_about_normalization (pfile, result, &nst, false);
 	break;
       }
 
@@ -3888,7 +3889,7 @@ _cpp_lex_direct (cpp_reader *pfile)
 	result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
 						&nst,
 						&result->val.node.spelling);
-	warn_about_normalization (pfile, result, &nst);
+	warn_about_normalization (pfile, result, &nst, true);
       }
 
       /* Convert named operators to their proper types.  */
@@ -4101,7 +4102,7 @@ _cpp_lex_direct (cpp_reader *pfile)
 	  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
 	  result->type = CPP_NUMBER;
 	  lex_number (pfile, &result->val.str, &nst);
-	  warn_about_normalization (pfile, result, &nst);
+	  warn_about_normalization (pfile, result, &nst, false);
 	}
       else if (*buffer->cur == '.' && buffer->cur[1] == '.')
 	buffer->cur += 2, result->type = CPP_ELLIPSIS;
@@ -4192,7 +4193,7 @@ _cpp_lex_direct (cpp_reader *pfile)
 	    result->type = CPP_NAME;
 	    result->val.node.node = lex_identifier (pfile, base, true, &nst,
 						    &result->val.node.spelling);
-	    warn_about_normalization (pfile, result, &nst);
+	    warn_about_normalization (pfile, result, &nst, true);
 	    break;
 	  }
 

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-10-14 23:10 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-14 23:10 [committed] preprocessor: C2x identifier rules Joseph Myers

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).