commit c4260c7c49822522945377cc2fb93ee9830cefc8 Author: Tom Honermann Date: Sat Feb 13 09:02:34 2021 -0500 N2653 char8_t for C: Language support This patch implements the core language and compiler-dependent library changes proposed in WG14 N2653 for C. The changes include: - Use of the existing -fchar8_t and -fno-char8_t options to opt in to (or opt out of) the following changes when compiling C code. - Change of type for UTF-8 string literals from array of const char to array of const char8_t (unsigned char). - A new atomic_char8_t typedef. - A new ATOMIC_CHAR8_T_LOCK_FREE macro defined in terms of a new predefined __GCC_ATOMIC_CHAR8_T_LOCK_FREE macro. When -fchar8_t support is enabled for non-C++ modes, the _CHAR8_T_SOURCE macro is predefined. This is the mechanism proposed to glibc to opt in to declarations of the char8_t typedef and c8rtomb and mbrtoc8 functions proposed in N2653. diff --git a/gcc/c-family/c-cppbuiltin.c b/gcc/c-family/c-cppbuiltin.c index 42b7604c9ac..3e944ec2b86 100644 --- a/gcc/c-family/c-cppbuiltin.c +++ b/gcc/c-family/c-cppbuiltin.c @@ -1467,6 +1467,11 @@ c_cpp_builtins (cpp_reader *pfile) if (flag_iso) cpp_define (pfile, "__STRICT_ANSI__"); + /* Express intent for char8_t support in C (not C++) to the C library if + requested. 
*/ + if (!c_dialect_cxx () && flag_char8_t) + cpp_define (pfile, "_CHAR8_T_SOURCE"); + if (!flag_signed_char) cpp_define (pfile, "__CHAR_UNSIGNED__"); diff --git a/gcc/c-family/c-lex.c b/gcc/c-family/c-lex.c index c44e7a13489..e30e44e9f5c 100644 --- a/gcc/c-family/c-lex.c +++ b/gcc/c-family/c-lex.c @@ -1335,7 +1335,14 @@ lex_string (const cpp_token *tok, tree *valp, bool objc_string, bool translate) default: case CPP_STRING: case CPP_UTF8STRING: - value = build_string (1, ""); + if (type == CPP_UTF8STRING && flag_char8_t) + { + value = build_string (TYPE_PRECISION (char8_type_node) + / TYPE_PRECISION (char_type_node), + ""); /* char8_t is 8 bits */ + } + else + value = build_string (1, ""); break; case CPP_STRING16: value = build_string (TYPE_PRECISION (char16_type_node) diff --git a/gcc/c-family/c-opts.c b/gcc/c-family/c-opts.c index 60b5802722c..eefc607dac6 100644 --- a/gcc/c-family/c-opts.c +++ b/gcc/c-family/c-opts.c @@ -718,6 +718,10 @@ c_common_handle_option (size_t scode, const char *arg, HOST_WIDE_INT value, case OPT_v: verbose = true; break; + + case OPT_fchar8_t: + cpp_opts->char8 = value; + break; } switch (c_language) diff --git a/gcc/c-family/c.opt b/gcc/c-family/c.opt index 91929706aff..eadb2468aa9 100644 --- a/gcc/c-family/c.opt +++ b/gcc/c-family/c.opt @@ -1451,8 +1451,8 @@ C ObjC C++ ObjC++ Where shorter, use canonicalized paths to systems headers. fchar8_t -C++ ObjC++ Var(flag_char8_t) Init(-1) -Enable the char8_t fundamental type and use it as the type for UTF-8 string +C ObjC C++ ObjC++ Var(flag_char8_t) Init(-1) +Enable the char8_t type and use it as the type for UTF-8 string and character literals. 
fcheck-pointer-bounds diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c index d71fd0abe90..501253d0ffe 100644 --- a/gcc/c/c-parser.c +++ b/gcc/c/c-parser.c @@ -7425,7 +7425,14 @@ c_parser_string_literal (c_parser *parser, bool translate, bool wide_ok) default: case CPP_STRING: case CPP_UTF8STRING: - value = build_string (1, ""); + if (type == CPP_UTF8STRING && flag_char8_t) + { + value = build_string (TYPE_PRECISION (char8_type_node) + / TYPE_PRECISION (char_type_node), + ""); /* char8_t is 8 bits */ + } + else + value = build_string (1, ""); break; case CPP_STRING16: value = build_string (TYPE_PRECISION (char16_type_node) @@ -7450,9 +7457,14 @@ c_parser_string_literal (c_parser *parser, bool translate, bool wide_ok) { default: case CPP_STRING: - case CPP_UTF8STRING: TREE_TYPE (value) = char_array_type_node; break; + case CPP_UTF8STRING: + if (flag_char8_t) + TREE_TYPE (value) = char8_array_type_node; + else + TREE_TYPE (value) = char_array_type_node; + break; case CPP_STRING16: TREE_TYPE (value) = char16_array_type_node; break; diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c index 5f322874423..1fa95949919 100644 --- a/gcc/c/c-typeck.c +++ b/gcc/c/c-typeck.c @@ -7979,7 +7979,8 @@ digest_init (location_t init_loc, tree type, tree init, tree origtype, if (char_array) { - if (typ2 != char_type_node) + if (typ2 != char_type_node + && typ2 != unsigned_char_type_node) /* char8_t literal */ incompat_string_cst = true; } else if (!comptypes (typ1, typ2)) diff --git a/gcc/ginclude/stdatomic.h b/gcc/ginclude/stdatomic.h index 23c07be2a48..6629902a666 100644 --- a/gcc/ginclude/stdatomic.h +++ b/gcc/ginclude/stdatomic.h @@ -49,6 +49,9 @@ typedef _Atomic long atomic_long; typedef _Atomic unsigned long atomic_ulong; typedef _Atomic long long atomic_llong; typedef _Atomic unsigned long long atomic_ullong; +#if defined(_CHAR8_T_SOURCE) +typedef _Atomic __CHAR8_TYPE__ atomic_char8_t; +#endif typedef _Atomic __CHAR16_TYPE__ atomic_char16_t; typedef _Atomic __CHAR32_TYPE__ 
atomic_char32_t; typedef _Atomic __WCHAR_TYPE__ atomic_wchar_t; @@ -97,6 +100,9 @@ extern void atomic_signal_fence (memory_order); #define ATOMIC_BOOL_LOCK_FREE __GCC_ATOMIC_BOOL_LOCK_FREE #define ATOMIC_CHAR_LOCK_FREE __GCC_ATOMIC_CHAR_LOCK_FREE +#if defined(_CHAR8_T_SOURCE) +#define ATOMIC_CHAR8_T_LOCK_FREE __GCC_ATOMIC_CHAR8_T_LOCK_FREE +#endif #define ATOMIC_CHAR16_T_LOCK_FREE __GCC_ATOMIC_CHAR16_T_LOCK_FREE #define ATOMIC_CHAR32_T_LOCK_FREE __GCC_ATOMIC_CHAR32_T_LOCK_FREE #define ATOMIC_WCHAR_T_LOCK_FREE __GCC_ATOMIC_WCHAR_T_LOCK_FREE diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index 7e840635a38..4c90f8bbbda 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -358,6 +358,9 @@ struct cpp_options /* Nonzero means process u8 prefixed character literals (UTF-8). */ unsigned char utf8_char_literals; + /* Nonzero means char8_t support is enabled. */ + unsigned char char8; + /* Nonzero means process r/R raw strings. If this is set, uliterals must be set as well. */ unsigned char rliterals;