2022-08-25 Jakub Jelinek PR c++/106648 libcpp/ * charset.cc - Implement C++23 P2071R2 - Named universal character escapes. Include uname2c.h. (hangul_syllables, hangul_count): New variables. (struct uname2c_data): New type. (_cpp_uname2c, _cpp_uname2c_uax44_lm2): New functions. (_cpp_valid_ucn): Use them. Handle named universal character escapes. (convert_ucn): Adjust comment. (convert_escape): Call convert_ucn even for \N. (_cpp_interpret_identifier): Handle named universal character escapes. * lex.cc (get_bidi_ucn): Fix up function comment formatting. (get_bidi_named): New function. (forms_identifier_p, lex_string): Handle named universal character escapes. * makeuname2c.cc: New file. * uname2c.h: New generated file. gcc/testsuite/ * c-c++-common/cpp/named-universal-char-escape-1.c: New test. * c-c++-common/cpp/named-universal-char-escape-2.c: New test. * c-c++-common/cpp/named-universal-char-escape-3.c: New test. * c-c++-common/cpp/named-universal-char-escape-4.c: New test. * c-c++-common/Wbidi-chars-25.c: New test. * gcc.dg/cpp/named-universal-char-escape-1.c: New test. * gcc.dg/cpp/named-universal-char-escape-2.c: New test. * g++.dg/cpp/named-universal-char-escape-1.C: New test. * g++.dg/cpp/named-universal-char-escape-2.C: New test. --- libcpp/charset.cc.jj 2022-08-22 11:17:40.674385078 +0200 +++ libcpp/charset.cc 2022-08-25 10:34:16.652212078 +0200 @@ -921,6 +921,342 @@ struct ucnrange { /* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive. 
*/ #define UCS_LIMIT 0x10FFFF +#include "uname2c.h" + +static const char hangul_syllables[][4] = { + /* L */ + "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "", + "J", "JJ", "C", "K", "T", "P", "H", + /* V */ + "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE", + "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I", + /* T */ + "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB", + "LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C", + "K", "T", "P", "H" +}; + +static const short hangul_count[6] = { 19, 21, 28 }; + +/* Used for Unicode loose matching rule UAX44-LM2 matching. */ + +struct uname2c_data +{ + char *canon_name; + char prev_char; +}; + +/* Map NAME, a Unicode character name or correction/control/alternate + alias, to a Unicode codepoint, or return (cppchar_t) -1 if + not found. This uses a space optimized radix tree precomputed + by the makeuname2c utility, with binary format documented in its + source makeuname2c.cc. */ + +static cppchar_t +_cpp_uname2c (const char *name, size_t len, const unsigned char *n, + struct uname2c_data *data) +{ + do + { + char k; + const char *key; + size_t key_len, len_adj; + bool has_value = *n & 0x40; + bool has_children, no_sibling = false; + cppchar_t codepoint = -1; + const unsigned char *child = NULL; + int ret; + + if (*n & 0x80) + { + k = ' ' + (*n++ & 0x3f); + key = &k; + key_len = 1; + } + else + { + key_len = *n++ & 0x3f; + key = &uname2c_dict[*n++]; + key += (*n++ << 8); + } + if (has_value) + { + codepoint = *n + (n[1] << 8) + ((n[2] & 0x1f) << 16); + has_children = n[2] & 0x80; + no_sibling = n[2] & 0x40; + n += 3; + } + else + has_children = true; + if (has_children) + { + unsigned int shift = 0; + size_t child_off = 0; + + do + { + child_off |= (*n & 0x7f) << shift; + shift += 7; + } + while ((*n++ & 0x80) != 0); + child = n + child_off; + } + if (__builtin_expect (data == NULL, 1)) + { + ret = memcmp (name, key, len > key_len ? 
key_len : len); + len_adj = key_len; + } + else + { + const char *p = name, *q = key; + + while (1) + { + if ((size_t) (p - name) == len || (size_t) (q - key) == key_len) + break; + if (*q == ' ') + { + ++q; + continue; + } + if (*q == '-') + { + /* This is the hard case. Only medial hyphens + should be removed, where medial means preceded + and followed by alnum. */ + if (ISALNUM (q == key ? data->prev_char : q[-1])) + { + if (q + 1 == key + key_len) + { + /* We don't know what the next letter will be. + It could be ISALNUM, then we are supposed + to omit it, or it could be a space and then + we should not omit it and need to compare it. + Fortunately the only 3 names with hyphen + followed by non-letter are + U+0F0A TIBETAN MARK BKA- SHOG YIG MGO + U+0FD0 TIBETAN MARK BKA- SHOG GI MGO RGYAN + U+0FD0 TIBETAN MARK BSKA- SHOG GI MGO RGYAN + Furthermore, prefixes of NR2 generated + ranges all end with a hyphen, but the generated + part is then followed by alpha-numeric. + So, let's just assume that - at the end of + key is always followed by alphanumeric and + so should be omitted. + makeuname2c.cc verifies that this is true. */ + ++q; + continue; + } + else if (ISALNUM (q[1])) + { + ++q; + continue; + } + } + } + if (*p != *q) + break; + ++p; + ++q; + } + len_adj = p - name; + /* If we don't consume the whole key, signal a mismatch, + but always with ret = 1, so that we keep looking through + siblings. */ + ret = q < key + key_len; + } + if (ret < 0) + return -1; + else if (ret == 0) + { + if (len < len_adj) + return -1; + else if (codepoint >= 0xd800 + && codepoint < 0xd800 + ARRAY_SIZE (uname2c_generated)) + { + name += len_adj; + len -= len_adj; + if (codepoint == 0xd800) + { + /* NR1 - Hangul syllables. 
*/ + size_t start = 0, end, i, j; + int this_len, max_len; + char winner[3]; + + for (i = 0; i < 3; ++i) + { + end = start + hangul_count[i]; + max_len = -1; + winner[i] = -1; + for (j = start; j < end; j++) + { + this_len = strlen (hangul_syllables[j]); + if (len >= (size_t) this_len + && this_len > max_len + && memcmp (name, hangul_syllables[j], + this_len) == 0) + { + max_len = this_len; + winner[i] = j - start; + } + } + if (max_len == -1) + return -1; + name += max_len; + len -= max_len; + start = end; + } + if (__builtin_expect (data != NULL, 0)) + { + memcpy (data->canon_name, key, key_len); + data->canon_name[key_len] = '\0'; + for (i = 0, start = 0; i < 3; ++i) + { + strcat (data->canon_name, + hangul_syllables[start + winner[i]]); + start += hangul_count[i]; + } + } + return (0xac00 + 21 * 28 * winner[0] + + 28 * winner[1] + winner[2]); + } + else + { + /* NR2 - prefix followed by hexadecimal codepoint. */ + const cppchar_t *p; + size_t i; + + if (len < 4 || len > 5) + return -1; + p = uname2c_pairs + uname2c_generated[codepoint - 0xd800]; + codepoint = 0; + for (i = 0; i < len; ++i) + { + codepoint <<= 4; + if (!ISXDIGIT (name[i])) + return -1; + codepoint += hex_value (name[i]); + } + for (; *p; p += 2) + if (codepoint < *p) + return -1; + else if (codepoint <= p[1]) + { + if (__builtin_expect (data != NULL, 0)) + { + memcpy (data->canon_name, key, key_len); + memcpy (data->canon_name + key_len, name, len); + data->canon_name[key_len + len] = '\0'; + } + return codepoint; + } + return -1; + } + } + else if (__builtin_expect (data != NULL, 0)) + { + if (len == len_adj) + { + memcpy (data->canon_name, key, key_len); + data->canon_name[key_len] = '\0'; + return codepoint; + } + if (has_children) + { + struct uname2c_data save = *data; + memcpy (data->canon_name, key, key_len); + data->canon_name += key_len; + data->prev_char = key[key_len - 1]; + codepoint = _cpp_uname2c (name + len_adj, len - len_adj, + child, data); + if (codepoint != (cppchar_t) -1) + 
return codepoint; + *data = save; + } + } + else if (len == len_adj) + return codepoint; + else if (!has_children) + return -1; + else + { + name += len_adj; + len -= len_adj; + n = child; + continue; + } + } + if (no_sibling || (!has_value && *n == 0xff)) + break; + } + while (1); + return -1; +} + +/* Try to do a loose name lookup according to Unicode loose matching rule + UAX44-LM2. First ignore medial hyphens, whitespace, underscore + characters and convert to upper case. */ + +static cppchar_t +_cpp_uname2c_uax44_lm2 (const char *name, size_t len, char *canon_name) +{ + char name_after_uax44_lm2[uname2c_max_name_len]; + char *q = name_after_uax44_lm2; + const char *p; + + for (p = name; p < name + len; p++) + if (*p == '_' || *p == ' ') + continue; + else if (*p == '-' && p != name && ISALNUM (p[-1]) && ISALNUM (p[1])) + continue; + else if (q == name_after_uax44_lm2 + uname2c_max_name_len) + return -1; + else if (ISLOWER (*p)) + *q++ = TOUPPER (*p); + else + *q++ = *p; + + struct uname2c_data data; + data.canon_name = canon_name; + data.prev_char = ' '; + /* Hangul Jungseong O- E after UAX44-LM2 should be HANGULJUNGSEONGO-E + and so should match U+1180. */ + if (q - name_after_uax44_lm2 == sizeof ("HANGULJUNGSEONGO-E") - 1 + && memcmp (name_after_uax44_lm2, "HANGULJUNGSEONGO-E", + sizeof ("HANGULJUNGSEONGO-E") - 1) == 0) + { + name_after_uax44_lm2[sizeof ("HANGULJUNGSEONGO") - 1] = 'E'; + --q; + } + cppchar_t result + = _cpp_uname2c (name_after_uax44_lm2, q - name_after_uax44_lm2, + uname2c_tree, &data); + + /* Unicode UAX44-LM2 exception: + U+116C HANGUL JUNGSEONG OE + U+1180 HANGUL JUNGSEONG O-E + We remove all medial hyphens when we shouldn't remove the U+1180 one. + The U+1180 entry sorts before U+116C lexicographically, so we get U+1180 + in both cases. Thus, if result is U+1180, check if user's name doesn't + have a hyphen there and adjust. 
*/ + if (result == 0x1180) + { + while (p[-1] == ' ' || p[-1] == '_') + --p; + gcc_assert (TOUPPER (p[-1]) == 'E'); + --p; + while (p[-1] == ' ' || p[-1] == '_') + --p; + if (p[-1] != '-') + { + result = 0x116c; + memcpy (canon_name + sizeof ("HANGUL JUNGSEONG O") - 1, "E", 2); + } + } + return result; +} + + /* Returns 1 if C is valid in an identifier, 2 if C is valid except at the start of an identifier, and 0 if C is not valid in an identifier. We assume C has already gone through the checks of @@ -1094,7 +1430,7 @@ _cpp_valid_ucn (cpp_reader *pfile, const unsigned int length; const uchar *str = *pstr; const uchar *base = str - 2; - bool delimited = false; + bool delimited = false, named = false; if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99)) cpp_error (pfile, CPP_DL_WARNING, @@ -1108,6 +1444,7 @@ _cpp_valid_ucn (cpp_reader *pfile, const "the meaning of '\\%c' is different in traditional C", (int) str[-1]); + result = 0; if (str[-1] == 'u') { length = 4; @@ -1122,44 +1459,130 @@ _cpp_valid_ucn (cpp_reader *pfile, const } else if (str[-1] == 'U') length = 8; - else + else if (str[-1] == 'N') { - cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN"); length = 4; - } - - result = 0; - do - { - if (str == limit) - break; - c = *str; - if (!ISXDIGIT (c)) - break; - str++; - extend_char_range (char_range, loc_reader); - if (delimited) + if (str == limit || *str != '{') + cpp_error (pfile, CPP_DL_ERROR, "'\\N' not followed by '{'"); + else { - if (!result) - /* Accept arbitrary number of leading zeros. - 16 is another magic value, smaller than 32 above - and bigger than 8, so that upon encountering first - non-zero digit we can count 8 digits and after that - or in overflow bit and ensure length doesn't decrease - to 0, as delimited escape sequence doesn't have upper - bound on the number of hex digits. 
*/ - length = 16; - else if (length == 16 - 8) + str++; + named = true; + extend_char_range (char_range, loc_reader); + length = 0; + const uchar *name = str; + bool strict = true; + + do { - /* Make sure we detect overflows. */ - result |= 0x8000000; - ++length; + if (str == limit) + break; + c = *str; + if (!ISIDNUM (c) && c != ' ' && c != '-') + break; + if (ISLOWER (c) || c == '_') + strict = false; + str++; + extend_char_range (char_range, loc_reader); } - } + while (1); - result = (result << 4) + hex_value (c); + if (str < limit && *str == '}') + { + if (name == str && identifier_pos) + { + *cp = 0; + return false; + } + if (name == str) + cpp_error (pfile, CPP_DL_ERROR, + "empty named universal character escape sequence"); + else if (!CPP_OPTION (pfile, delimited_escape_seqs) + && CPP_OPTION (pfile, cpp_pedantic)) + cpp_error (pfile, CPP_DL_PEDWARN, + "named universal character escapes are only valid " + "in C++23"); + if (name == str) + result = 0x40; + else + { + /* If the name is longer than maximum length of a Unicode + name, it can't be strictly valid. */ + if ((size_t) (str - name) > uname2c_max_name_len || !strict) + result = -1; + else + result = _cpp_uname2c ((const char *) name, str - name, + uname2c_tree, NULL); + if (result == (cppchar_t) -1) + { + cpp_error (pfile, CPP_DL_ERROR, + "\\N{%.*s} is not a valid universal " + "character", (int) (str - name), name); + + /* Try to do a loose name lookup according to + Unicode loose matching rule UAX44-LM2. 
*/ + char canon_name[uname2c_max_name_len + 1]; + result = _cpp_uname2c_uax44_lm2 ((const char *) name, + str - name, canon_name); + if (result != (cppchar_t) -1) + cpp_error (pfile, CPP_DL_NOTE, + "did you mean \\N{%s}?", canon_name); + else + result = 0x40; + } + } + str++; + extend_char_range (char_range, loc_reader); + } + else if (identifier_pos) + length = 1; + else + { + cpp_error (pfile, CPP_DL_ERROR, + "'\\N{' not terminated with '}' after %.*s", + (int) (str - base), base); + result = 1; + } + } } - while (--length); + else + { + cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN"); + length = 4; + } + + if (!named) + do + { + if (str == limit) + break; + c = *str; + if (!ISXDIGIT (c)) + break; + str++; + extend_char_range (char_range, loc_reader); + if (delimited) + { + if (!result) + /* Accept arbitrary number of leading zeros. + 16 is another magic value, smaller than 32 above + and bigger than 8, so that upon encountering first + non-zero digit we can count 8 digits and after that + or in overflow bit and ensure length doesn't decrease + to 0, as delimited escape sequence doesn't have upper + bound on the number of hex digits. */ + length = 16; + else if (length == 16 - 8) + { + /* Make sure we detect overflows. */ + result |= 0x8000000; + ++length; + } + } + + result = (result << 4) + hex_value (c); + } + while (--length); if (delimited && str < limit @@ -1274,7 +1697,7 @@ convert_ucn (cpp_reader *pfile, const uc /* loc_reader and ranges must either be both NULL, or both be non-NULL. */ gcc_assert ((loc_reader != NULL) == (ranges != NULL)); - from++; /* Skip u/U. */ + from++; /* Skip u/U/N. */ /* The u/U is part of the spelling of this character. */ extend_char_range (&char_range, loc_reader); @@ -1665,7 +2088,7 @@ convert_escape (cpp_reader *pfile, const switch (c) { /* UCNs, hex escapes, and octal escapes are processed separately. 
*/ - case 'u': case 'U': + case 'u': case 'U': case 'N': return convert_ucn (pfile, from, limit, tbuf, cvt, char_range, loc_reader, ranges); @@ -2256,31 +2679,47 @@ _cpp_interpret_identifier (cpp_reader *p *bufp++ = id[idp]; else { - unsigned length = id[idp+1] == 'u' ? 4 : 8; + unsigned length = id[idp + 1] == 'u' ? 4 : 8; cppchar_t value = 0; size_t bufleft = len - (bufp - buf); int rval; bool delimited = false; idp += 2; - if (length == 4 && id[idp] == '{') + if (id[idp - 1] == 'N' && id[idp] == '{') { - delimited = true; idp++; + const uchar *name = &id[idp]; + while (idp < len + && (ISIDNUM (id[idp]) || id[idp] == ' ' || id[idp] == '-')) + idp++; + if (id[idp] == '}') + { + value = _cpp_uname2c ((const char *) name, &id[idp] - name, + uname2c_tree, NULL); + if (value == (cppchar_t) -1) + value = 1; + } + else + idp--; } - while (length && idp < len && ISXDIGIT (id[idp])) + else { - value = (value << 4) + hex_value (id[idp]); - idp++; - if (!delimited) - length--; + if (length == 4 && id[idp] == '{') + { + delimited = true; + idp++; + } + while (length && idp < len && ISXDIGIT (id[idp])) + { + value = (value << 4) + hex_value (id[idp]); + idp++; + if (!delimited) + length--; + } + if (!delimited || id[idp] != '}') + idp--; } - if (!delimited) - idp--; - /* else - assert (id[idp] == '}'); - As the caller ensures it is a valid identifier, if it is - delimited escape sequence, it must be terminated by }. */ /* Special case for EBCDIC: if the identifier contains a '$' specified using a UCN, translate it to EBCDIC. */ --- libcpp/lex.cc.jj 2022-08-22 11:17:40.761383904 +0200 +++ libcpp/lex.cc 2022-08-25 10:04:15.165363369 +0200 @@ -1512,7 +1512,7 @@ get_bidi_ucn_1 (const unsigned char *p, } /* Parse a UCN where P points just past \u or \U and return its bidi code. - If the kind is not NONE, write the location to *OUT.*/ + If the kind is not NONE, write the location to *OUT. 
*/ static bidi::kind get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U, @@ -1529,6 +1529,64 @@ get_bidi_ucn (cpp_reader *pfile, const u return result; }
+/* Parse a named universal character escape where P points just past \N and
+   return its bidi code.  If the kind is not NONE, write the location to
+   *OUT.  Only the exact names of the bidirectional control characters
+   checked below, immediately followed by '}', are recognized; for anything
+   else NONE is returned and *OUT is not written.  */
+
+static bidi::kind
+get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
+{
+  bidi::kind result = bidi::kind::NONE;
+  if (*p != '{')
+    return bidi::kind::NONE;
+  /* P[0] is the '{', so the name starts at P + 1 and the part following an
+     N character long prefix starts at P + 1 + N.  */
+  if (strncmp ((const char *) (p + 1), "LEFT-TO-RIGHT ", 14) == 0)
+    {
+      if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
+        result = bidi::kind::LTR;
+      else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
+        result = bidi::kind::LRE;
+      else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
+        result = bidi::kind::LRO;
+      else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
+        result = bidi::kind::LRI;
+    }
+  else if (strncmp ((const char *) (p + 1), "RIGHT-TO-LEFT ", 14) == 0)
+    {
+      if (strncmp ((const char *) (p + 15), "MARK}", 5) == 0)
+        result = bidi::kind::RTL;
+      else if (strncmp ((const char *) (p + 15), "EMBEDDING}", 10) == 0)
+        result = bidi::kind::RLE;
+      else if (strncmp ((const char *) (p + 15), "OVERRIDE}", 9) == 0)
+        result = bidi::kind::RLO;
+      else if (strncmp ((const char *) (p + 15), "ISOLATE}", 8) == 0)
+        result = bidi::kind::RLI;
+    }
+  else if (strncmp ((const char *) (p + 1), "POP DIRECTIONAL ", 16) == 0)
+    {
+      /* "POP DIRECTIONAL " is 16 characters, so the rest of the name is at
+         P + 17; P + 16 is still the space ending the prefix.  */
+      if (strncmp ((const char *) (p + 17), "FORMATTING}", 11) == 0)
+        result = bidi::kind::PDF;
+      else if (strncmp ((const char *) (p + 17), "ISOLATE}", 8) == 0)
+        result = bidi::kind::PDI;
+    }
+  else if (strncmp ((const char *) (p + 1), "FIRST STRONG ISOLATE}", 21) == 0)
+    result = bidi::kind::FSI;
+  /* The location covers the whole escape, from the backslash at P - 2
+     up to and including the closing '}'.  */
+  if (result != bidi::kind::NONE)
+    *out = get_location_for_byte_range_in_cur_line (pfile, p - 2,
+                                                     (strchr ((const char *)
+                                                              (p + 1), '}')
+                                                      - (const char *) p)
+                                                     + 3);
+  return result;
+}
+
 /* Subclass of
rich_location for reporting on unpaired UTF-8 bidirectional control character(s). Escape the source lines on output, and show all unclosed @@ -1914,16 +1964,20 @@ forms_identifier_p (cpp_reader *pfile, i return true; } else if (*buffer->cur == '\\' - && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U')) + && (buffer->cur[1] == 'u' + || buffer->cur[1] == 'U' + || buffer->cur[1] == 'N')) { buffer->cur += 2; if (warn_bidi_p) { location_t loc; - bidi::kind kind = get_bidi_ucn (pfile, - buffer->cur, - buffer->cur[-1] == 'U', - &loc); + bidi::kind kind; + if (buffer->cur[-1] == 'N') + kind = get_bidi_named (pfile, buffer->cur, &loc); + else + kind = get_bidi_ucn (pfile, buffer->cur, + buffer->cur[-1] == 'U', &loc); maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc); } if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first, @@ -2657,11 +2711,14 @@ lex_string (cpp_reader *pfile, cpp_token /* In #include-style directives, terminators are not escapable. */ if (c == '\\' && !pfile->state.angled_headers && *cur != '\n') { - if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p) + if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p) { location_t loc; - bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', - &loc); + bidi::kind kind; + if (cur[0] == 'N') + kind = get_bidi_named (pfile, cur + 1, &loc); + else + kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U', &loc); maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc); } cur++; --- libcpp/makeuname2c.cc.jj 2022-08-25 10:04:15.166363356 +0200 +++ libcpp/makeuname2c.cc 2022-08-25 10:04:15.166363356 +0200 @@ -0,0 +1,793 @@ +/* Make uname2c.h from various sources. + Copyright (C) 2005-2022 Free Software Foundation, Inc. + Contributed by Jakub Jelinek + +This program is free software; you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the +Free Software Foundation; either version 3, or (at your option) any +later version. 
+ +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; see the file COPYING3. If not see +<http://www.gnu.org/licenses/>. */ + +/* Run this program as + ./makeuname2c UnicodeData.txt NameAliases.txt > uname2c.h + + This program generates 2 big arrays and 2 small ones. + The large ones are uname2c_dict, initialized by string literal + representing dictionary, and uname2c_tree, which is a space optimized + radix tree. + The format of the radix tree is: + byte 0 either 0x80 + (key[0] - ' ') (if key_len == 1) + or key_len (otherwise) + either of them ored with 0x40 if it has a codepoint + byte 1 LSB of offset into uname2c_dict for key (only if key_len > 1) + byte 2 MSB of offset into uname2c_dict for key (only if key_len > 1) + if key_len == 1, the above 2 bytes are omitted + byte 3 LSB of codepoint (only if it has a codepoint) + byte 4 middle byte of codepoint (ditto) + byte 5 MSB of codepoint (ditto), ored with 0x80 if node has children + ored with 0x40 if it doesn't have siblings + if it doesn't have a codepoint, the above 3 bytes are omitted + and we assume that the node has children + byte 6, 7, 8 uleb128 encoded offset to first child relative to the end + of the uleb128 (only if node has children) + byte 9 0xff (only if node doesn't have a codepoint and doesn't + have siblings) + + For prefixes of Unicode NR1 or NR2 rule generated names, on a node + representing end of the prefix codepoint is 0xd800 + index into + uname2c_generated array with indexes into uname2c_pairs array of + code points (low, high) of the ranges terminated by single 0. + 0xd800 is NR1 rule (Hangul syllables), rest are NR2 rules. 
+*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0])) + +#define NUM_CODE_POINTS 0x110000 +#define MAX_CODE_POINT 0x10ffff +#define NO_VALUE 0xdc00 +#define GENERATED 0xd800 + +struct entry { const char *name; unsigned long codepoint; }; +static struct entry *entries; +static unsigned long num_allocated, num_entries; + +/* Unicode 14 Table 4-8. */ +struct generated { + const char *prefix; + /* max_high is a workaround for UnicodeData.txt inconsistencies + on a few CJK UNIFIED IDEOGRAPH- ranges where the "*, Last>" + entry is a few code points above the end of the range. */ + unsigned long low, high, max_high; + int idx, ok; +}; +static struct generated generated_ranges[] = +{ { "HANGUL SYLLABLE ", 0xac00, 0xd7a3, 0, 0, 0 }, /* NR1 rule */ + { "CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4dbf, 0, 1, 0 }, /* NR2 rules */ + { "CJK UNIFIED IDEOGRAPH-", 0x4e00, 0x9ffc, 0x9fff, 1, 0 }, + { "CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2a6dd, 0x2a6df, 1, 0 }, + { "CJK UNIFIED IDEOGRAPH-", 0x2a700, 0x2b734, 0x2b738, 1, 0 }, + { "CJK UNIFIED IDEOGRAPH-", 0x2b740, 0x2b81d, 0, 1, 0 }, + { "CJK UNIFIED IDEOGRAPH-", 0x2b820, 0x2cea1, 0, 1, 0 }, + { "CJK UNIFIED IDEOGRAPH-", 0x2ceb0, 0x2ebe0, 0, 1, 0 }, + { "CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134a, 0, 1, 0 }, + { "TANGUT IDEOGRAPH-", 0x17000, 0x187f7, 0, 2, 0 }, + { "TANGUT IDEOGRAPH-", 0x18d00, 0x18d08, 0, 2, 0 }, + { "KHITAN SMALL SCRIPT CHARACTER-", 0x18b00, 0x18cd5, 0, 3, 0 }, + { "NUSHU CHARACTER-", 0x1b170, 0x1b2fb, 0, 4, 0 }, + { "CJK COMPATIBILITY IDEOGRAPH-", 0xf900, 0xfa6d, 0, 5, 0 }, + { "CJK COMPATIBILITY IDEOGRAPH-", 0xfa70, 0xfad9, 0, 5, 0 }, + { "CJK COMPATIBILITY IDEOGRAPH-", 0x2f800, 0x2fa1d, 0, 5, 0 } +}; + +struct node { + struct node *sibling, *child; + const char *key; + size_t key_len, key_idx, node_size, size_sum, child_off; + unsigned long codepoint; + bool in_dict; +}; +static struct node *root, **nodes; +static unsigned 
long num_nodes; +static size_t dict_size, tree_size, max_entry_len; +static char *dict; +static unsigned char *tree; + +/* Die! */ + +static void +fail (const char *s, ...) +{ + va_list ap; + + va_start (ap, s); + vfprintf (stderr, s, ap); + va_end (ap); + fputc ('\n', stderr); + exit (1); +} + +static void * +xmalloc (size_t size) +{ + void *ret = malloc (size); + + if (ret == NULL) + fail ("failed to allocate %ld bytes", (long) size); + return ret; +} + +static void * +xrealloc (void *p, size_t size) +{ + void *ret = p ? realloc (p, size) : malloc (size); + + if (ret == NULL) + fail ("failed to allocate %ld bytes", (long) size); + return ret; +} + +static int +entrycmp (const void *p1, const void *p2) +{ + const struct entry *e1 = (const struct entry *) p1; + const struct entry *e2 = (const struct entry *) p2; + int ret = strcmp (e1->name, e2->name); + + if (ret != 0) + return ret; + if (e1->codepoint < e2->codepoint) + return -1; + if (e1->codepoint > e2->codepoint) + return 1; + return 0; +} + +static int +nodecmp (const void *p1, const void *p2) +{ + const struct node *n1 = *(const struct node *const *) p1; + const struct node *n2 = *(const struct node *const *) p2; + if (n1->key_len > n2->key_len) + return -1; + if (n1->key_len < n2->key_len) + return 1; + return memcmp (n1->key, n2->key, n1->key_len); +} + +/* Read FNAME, which is UnicodeData.txt, or NameAliases.txt if ALIASES_P, + and append a name to code point entry to the 'entries' array for each + character name or correction, control or alternate alias. For ranges + listed in generated_ranges, verify the file against that table and add + a single entry for the prefix of each distinct generated name. */ + +static void +read_table (char *fname, bool aliases_p) +{ + FILE *f = fopen (fname, "r"); + const char *sname = aliases_p ? 
"NameAliases.txt" : "UnicodeData.txt"; + + if (!f) + fail ("opening %s", sname); + for (;;) + { + char line[256]; + unsigned long codepoint; + const char *name, *aname; + char *l; + size_t i; + + if (!fgets (line, sizeof (line), f)) + break; + codepoint = strtoul (line, &l, 16); + if (l == line && aliases_p) + { + /* NameAliases.txt can contain comments and empty lines. */ + if (*line == '#' || *line == '\n') + continue; + } + if (l == line || *l != ';') + fail ("parsing %s, reading code point", sname); + if (codepoint > MAX_CODE_POINT) + fail ("parsing %s, code point too large", sname); + + name = l + 1; + do { + ++l; + } while (*l != ';'); + + aname = NULL; + if (aliases_p) + { + /* Ignore figment and abbreviation aliases. */ + if (strcmp (l + 1, "correction\n") != 0 + && strcmp (l + 1, "control\n") != 0 + && strcmp (l + 1, "alternate\n") != 0) + continue; + i = ARRAY_SIZE (generated_ranges); + } + else + { + for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i) + if (codepoint >= generated_ranges[i].low + && codepoint <= generated_ranges[i].max_high) + break; + if (i != ARRAY_SIZE (generated_ranges)) + { + if (*name == '<' && l[-1] == '>') + { + if (codepoint == generated_ranges[i].low + && l - name >= 9 + && memcmp (l - 8, ", First>", 8) == 0 + && generated_ranges[i].ok == 0) + { + generated_ranges[i].ok = INT_MAX - 1; + aname = generated_ranges[i].prefix; + codepoint = GENERATED + generated_ranges[i].idx; + } + /* Unfortunately, UnicodeData.txt isn't consistent + with the Table 4-8 range endpoints in 3 cases, + the ranges are longer there by a few codepoints. + So use the max_high hack to avoid verification + failures. 
*/ + else if (codepoint == generated_ranges[i].max_high + && l - name >= 8 + && memcmp (l - 7, ", Last>", 7) == 0 + && generated_ranges[i].ok == INT_MAX - 1) + { + generated_ranges[i].ok = INT_MAX; + continue; + } + else + fail ("unexpected generated entry %lx %.*s", + codepoint, (int) (l - name), name); + } + else if (codepoint + == generated_ranges[i].low + generated_ranges[i].ok + && l - name == (strlen (generated_ranges[i].prefix) + + (name - 1 - line)) + && memcmp (name, generated_ranges[i].prefix, + strlen (generated_ranges[i].prefix)) == 0 + && memcmp (name + strlen (generated_ranges[i].prefix), + line, name - 1 - line) == 0) + { + ++generated_ranges[i].ok; + if (codepoint != generated_ranges[i].low) + continue; + aname = generated_ranges[i].prefix; + codepoint = GENERATED + generated_ranges[i].idx; + } + else + fail ("unexpected generated entry %lx %.*s", + codepoint, (int) (l - name), name); + if (aname == generated_ranges[i].prefix) + { + size_t j; + + /* Don't add an entry for a generated range where the + same prefix has been added already. */ + for (j = 0; j < i; ++j) + if (generated_ranges[j].idx == generated_ranges[i].idx + && generated_ranges[j].ok != 0) + break; + if (j < i) + continue; + } + } + else if (*name == '<' && l[-1] == '>') + continue; + } + + if (num_entries == num_allocated) + { + num_allocated = num_allocated ? 2 * num_allocated : 65536; + entries = (struct entry *) xrealloc (entries, num_allocated + * sizeof (entries[0])); + } + + if (aname == NULL) + { + char *a = (char *) xmalloc (l + 1 - name); + if (l - name > max_entry_len) + max_entry_len = l - name; + memcpy (a, name, l - name); + a[l - name] = '\0'; + aname = a; + } + entries[num_entries].name = aname; + entries[num_entries++].codepoint = codepoint; + } + if (ferror (f)) + fail ("reading %s", sname); + fclose (f); +} + +/* Assumes nodes are added from sorted array, so we never + add any node before existing one, only after it. 
*/
+
+static void
+node_add (struct node **p, const char *key, size_t key_len,
+          unsigned long codepoint)
+{
+  struct node *n;
+  size_t i;
+
+  do
+    {
+      if (*p == NULL)
+        {
+          /* Empty slot: what is left of KEY becomes a new leaf.  */
+          *p = n = (struct node *) xmalloc (sizeof (struct node));
+          ++num_nodes;
+          assert (key_len);
+          n->sibling = NULL;
+          n->child = NULL;
+          n->key = key;
+          n->key_len = key_len;
+          n->codepoint = codepoint;
+          return;
+        }
+      n = *p;
+      /* I becomes the length of the common prefix of N's key and KEY.  */
+      for (i = 0; i < n->key_len && i < key_len; ++i)
+        if (n->key[i] != key[i])
+          break;
+      if (i == 0)
+        {
+          /* Nothing in common with this node, try its next sibling.  */
+          p = &n->sibling;
+          continue;
+        }
+      if (i == n->key_len)
+        {
+          /* N's whole key is a proper prefix of KEY: consume it and
+             continue among N's children.  */
+          assert (key_len > n->key_len);
+          p = &n->child;
+          key += n->key_len;
+          key_len -= n->key_len;
+          continue;
+        }
+      /* Need to split the node.  The new node N takes over the tail of
+         the old key together with the old children and codepoint; the
+         old node keeps just the common prefix and no value.  */
+      assert (i < key_len);
+      n = (struct node *) xmalloc (sizeof (struct node));
+      ++num_nodes;
+      n->sibling = NULL;
+      n->child = (*p)->child;
+      n->key = (*p)->key + i;
+      n->key_len = (*p)->key_len - i;
+      n->codepoint = (*p)->codepoint;
+      (*p)->child = n;
+      (*p)->key_len = i;
+      (*p)->codepoint = NO_VALUE;
+      key += i;
+      key_len -= i;
+      /* The rest of KEY is added as a sibling of the split-off tail.  */
+      p = &n->sibling;
+    }
+  while (1);
+}
+
+/* Append N, its siblings and all their descendants to the nodes array
+   (each node before its children), incrementing num_nodes for every
+   node stored.  */
+
+static void
+append_nodes (struct node *n)
+{
+  for (; n; n = n->sibling)
+    {
+      nodes[num_nodes++] = n;
+      append_nodes (n->child);
+    }
+}
+
+/* Return the number of bytes write_uleb128 stores for VAL (7 value bits
+   per byte, at least one byte).  */
+
+static size_t
+sizeof_uleb128 (size_t val)
+{
+  size_t sz = 0;
+  do
+    {
+      val >>= 7;
+      sz += 1;
+    }
+  while (val != 0);
+  return sz;
+}
+
+/* Compute node_size, child_off and size_sum of N and of every node
+   reachable from it through child and sibling links, and add each
+   node_size to tree_size.  The sizes mirror the bytes write_nodes
+   emits.  */
+
+static void
+size_nodes (struct node *n)
+{
+  /* Children and later siblings are sized first, their size_sum is
+     needed below.  */
+  if (n->child)
+    size_nodes (n->child);
+  if (n->sibling)
+    size_nodes (n->sibling);
+  /* One header byte, plus a 2 byte dictionary offset for keys longer
+     than one character.  */
+  n->node_size = 1 + (n->key_len > 1) * 2;
+  if (n->codepoint != NO_VALUE)
+    /* 3 bytes of codepoint and flags.  */
+    n->node_size += 3;
+  else if (n->sibling == NULL)
+    /* The 0xff byte terminating a sibling list whose last node has
+       no value.  */
+    ++n->node_size;
+  n->size_sum = 0;
+  n->child_off = 0;
+  if (n->sibling)
+    n->size_sum += n->sibling->size_sum;
+  if (n->child)
+    {
+      /* The children are placed after all following siblings (and their
+         subtrees) and after the terminator byte if there is one.  */
+      n->child_off = n->size_sum + (n->codepoint == NO_VALUE
+                                    && n->sibling == NULL);
+      n->node_size += sizeof_uleb128 (n->child_off);
+    }
+  n->size_sum += n->node_size;
+  if (n->child)
+    n->size_sum += n->child->size_sum;
+  tree_size += n->node_size;
+}
+
+/* Store VAL at P, 7 bits per byte starting with the least significant
+   ones; every byte except the last has bit 0x80 set.  */
+
+static void
+write_uleb128 (unsigned char *p, size_t val)
+{
+  unsigned char c;
+  do
+    {
+      c = val & 0x7f;
+      val >>= 7;
+      if (val)
+        c |= 0x80;
+      *p++ = c;
+    }
+  while (val);
+}
+
+/* Serialize N, its siblings and their subtrees into tree, starting at
+   offset OFF.  */
+
+static void
+write_nodes (struct node *n, size_t off)
+{
+  for (; n; n = n->sibling)
+    {
+      /* Check the bound before looking at the byte itself.  */
+      assert (off < tree_size && tree[off] == 0);
+      /* Header byte: either the key length (keys of 2 to 63 characters,
+         the text lives in the dictionary) or 0x80 ored with the single
+         key character biased by ' '.  Bit 0x40 says a codepoint
+         follows.  */
+      if (n->key_len > 1)
+        {
+          assert (n->key_len < 64);
+          tree[off] = n->key_len;
+        }
+      else
+        tree[off] = (n->key[0] - ' ') | 0x80;
+      assert ((tree[off] & 0x40) == 0);
+      if (n->codepoint != NO_VALUE)
+        tree[off] |= 0x40;
+      off++;
+      if (n->key_len > 1)
+        {
+          /* Little endian 16-bit offset of the key in the dictionary.  */
+          tree[off++] = n->key_idx & 0xff;
+          tree[off++] = (n->key_idx >> 8) & 0xff;
+        }
+      if (n->codepoint != NO_VALUE)
+        {
+          /* 21 bits of codepoint, little endian; the top bits of the
+             third byte flag "has children" (0x80) and "last sibling"
+             (0x40).  */
+          assert (n->codepoint < (1L << 21));
+          tree[off++] = n->codepoint & 0xff;
+          tree[off++] = (n->codepoint >> 8) & 0xff;
+          tree[off] = (n->codepoint >> 16) & 0xff;
+          if (n->child)
+            tree[off] |= 0x80;
+          if (!n->sibling)
+            tree[off] |= 0x40;
+          off++;
+        }
+      if (n->child)
+        {
+          /* Offset of the first child relative to the byte after the
+             uleb128 itself.  */
+          write_uleb128 (&tree[off], n->child_off);
+          off += sizeof_uleb128 (n->child_off);
+          write_nodes (n->child, off + n->child_off);
+        }
+      /* A node without a value has no flag byte to carry the "last
+         sibling" bit, so the list is ended by an explicit 0xff.  */
+      if (n->codepoint == NO_VALUE
+          && n->sibling == NULL)
+        tree[off++] = 0xff;
+    }
+  assert (off <= tree_size);
+}
+
+/* Build the radix tree from entries, choose dictionary offsets for all
+   multi-character keys and fill in dict and tree.  */
+
+static void
+build_radix_tree (void)
+{
+  size_t i, j, k, key_idx;
+
+  /* Sanity check the generated_ranges table; the loop stops early at the
+     first range that fails the check.
+     NOTE(review): presumably ok counts the codepoints of the range that
+     read_table has seen, with INT_MAX as a special marker - confirm
+     against read_table.  */
+  for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
+    if (generated_ranges[i].ok == INT_MAX)
+      {
+        if (generated_ranges[i].max_high - generated_ranges[i].high > 15UL)
+          break;
+      }
+    else if (generated_ranges[i].ok == (generated_ranges[i].high
+                                        - generated_ranges[i].low + 1))
+      {
+        if (generated_ranges[i].max_high != generated_ranges[i].high)
+          break;
+      }
+    else
+      break;
+  if (i < ARRAY_SIZE (generated_ranges))
+    fail ("uncovered generated range %s %lx %lx",
+          generated_ranges[i].prefix, generated_ranges[i].low,
+          generated_ranges[i].high);
+  /* Sort entries alphabetically, node_add relies on that.  */
+  qsort (entries, num_entries, sizeof (struct entry), entrycmp);
+  /* After sorting duplicates are adjacent.  */
+  for (i = 1; i < num_entries; ++i)
+    if (strcmp (entries[i].name, entries[i - 1].name) == 0)
+      fail ("multiple entries for name %s", entries[i].name);
+
+  for (i = 0; i < num_entries; ++i)
+    node_add (&root, entries[i].name, strlen (entries[i].name),
+              entries[i].codepoint);
+
+  /* Flatten the tree into an array of node pointers; append_nodes
+     recounts num_nodes while filling it.  */
+  nodes = (struct node **) xmalloc (num_nodes * sizeof (struct node *));
+  i = num_nodes;
+  num_nodes = 0;
+  append_nodes (root);
+  assert (num_nodes == i);
+  /* Sort node pointers by decreasing string length to handle substrings
+     right.  */
+  qsort (nodes, num_nodes, sizeof (struct node *), nodecmp);
+  if (nodes[0]->key_len >= 64)
+    /* We could actually encode even 64 and 65, as key_len 0 and 1 will
+       never appear in the multiple letter key encodings, so could subtract
+       2.  */
+    fail ("can't encode key length %d >= 64, so need to split some radix "
+          "tree nodes to ensure length fits", (int) nodes[0]->key_len);
+
+  /* Verify a property charset.cc UAX44-LM2 matching relies on:
+     if - is at the end of key of some node, then all its siblings
+     start with alphanumeric characters.
+     Only 2 character names and 1 alias have - followed by space:
+     U+0F0A TIBETAN MARK BKA- SHOG YIG MGO
+     U+0FD0 TIBETAN MARK BKA- SHOG GI MGO RGYAN
+     U+0FD0 TIBETAN MARK BSKA- SHOG GI MGO RGYAN
+     so the KA- in there will always be followed at least by SHOG
+     in the same node.
+     If this changes, charset.cc needs to change.  */
+  for (i = 0; i < num_nodes; ++i)
+    if (nodes[i]->key[nodes[i]->key_len - 1] == '-'
+        && nodes[i]->child)
+      {
+        struct node *n;
+
+        for (n = nodes[i]->child; n; n = n->sibling)
+          if (n->key[0] == ' ')
+            fail ("node with key %.*s followed by node with key %.*s",
+                  (int) nodes[i]->key_len, nodes[i]->key,
+                  (int) n->key_len, n->key);
+      }
+
+  /* Assign each multi-character key an offset into the dictionary,
+     reusing text that is already there whenever possible.
+     This is expensive, O(num_nodes * num_nodes * nodes[0]->key_len), but
+     fortunately num_nodes is < 64K and key_len < 64.  */
+  key_idx = 0;
+  for (i = 0; i < num_nodes; ++i)
+    {
+      nodes[i]->key_idx = SIZE_MAX;
+      nodes[i]->in_dict = false;
+      if (nodes[i]->key_len > 1)
+        {
+          /* Earlier nodes have keys at least as long as this one (see
+             the sort above), so the subtraction below can't wrap.  */
+          for (j = 0; j < i; ++j)
+            /* Can't rely on memmem unfortunately.  */
+            if (nodes[j]->in_dict)
+              {
+                /* First look for the whole key inside nodes[j]'s key.
+                   Setting J to I signals success to the outer loop.  */
+                for (k = 0; k <= nodes[j]->key_len - nodes[i]->key_len; ++k)
+                  if (nodes[j]->key[k] == nodes[i]->key[0]
+                      && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1,
+                                 nodes[i]->key_len - 1) == 0)
+                    {
+                      nodes[i]->key_idx = nodes[j]->key_idx + k;
+                      j = i;
+                      break;
+                    }
+                if (j == i)
+                  break;
+                /* Then look for a suffix of nodes[j]'s key that is
+                   a prefix of this key.  That is usable only if the rest
+                   of this key matches the start of the next in-dictionary
+                   node, whose text directly follows nodes[j]'s in the
+                   dictionary.  */
+                for (; k < nodes[j]->key_len; ++k)
+                  if (nodes[j]->key[k] == nodes[i]->key[0]
+                      && memcmp (nodes[j]->key + k + 1, nodes[i]->key + 1,
+                                 nodes[j]->key_len - 1 - k) == 0)
+                    {
+                      size_t l;
+
+                      for (l = j + 1; l < i; ++l)
+                        if (nodes[l]->in_dict)
+                          break;
+                      if (l < i
+                          && memcmp (nodes[l]->key,
+                                     nodes[i]->key + (nodes[j]->key_len - k),
+                                     nodes[i]->key_len
+                                     - (nodes[j]->key_len - k)) == 0)
+                        {
+                          nodes[i]->key_idx = nodes[j]->key_idx + k;
+                          j = i;
+                        }
+                      else
+                        /* Nodes between J and L are not in the dictionary,
+                           continue the search at L.  */
+                        j = l - 1;
+                      break;
+                    }
+              }
+          if (nodes[i]->key_idx == SIZE_MAX)
+            {
+              /* Not found anywhere, append the key to the dictionary.  */
+              nodes[i]->key_idx = key_idx;
+              nodes[i]->in_dict = true;
+              key_idx += nodes[i]->key_len;
+            }
+        }
+    }
+  if (key_idx >= 65536)
+    /* We only use 2 bytes for offsets into the dictionary.
+       If it grows more, there is e.g. a possibility to replace
+       most often seen words or substrings in the dictionary
+       with characters other than [A-Z0-9 -] (say LETTER occurs
+       in the dictionary almost 197 times and so by using a
+       instead of LETTER we could save (6 - 1) * 197 bytes,
+       with some on the side table mapping 'a' to "LETTER").  */
+    fail ("too large dictionary %ld", (long) key_idx);
+  dict_size = key_idx;
+
+  size_nodes (root);
+
+  dict = (char *) xmalloc (dict_size + 1);
+  for (i = 0; i < num_nodes; ++i)
+    if (nodes[i]->in_dict)
+      memcpy (dict + nodes[i]->key_idx, nodes[i]->key, nodes[i]->key_len);
+  dict[dict_size] = '\0';
+
+  /* write_nodes asserts that every byte it writes is still zero.  */
+  tree = (unsigned char *) xmalloc (tree_size);
+  memset (tree, 0, tree_size);
+  write_nodes (root, 0);
+}
+
+/* Print out the huge copyright notice.  */
+
+static void
+write_copyright (void)
+{
+  static const char copyright[] = "\
+/* Unicode name to codepoint.\n\
+ Copyright (C) 2005-2022 Free Software Foundation, Inc.\n\
+\n\
+ This program is free software; you can redistribute it and/or modify it\n\
+ under the terms of the GNU General Public License as published by the\n\
+ Free Software Foundation; either version 3, or (at your option) any\n\
+ later version.\n\
+\n\
+ This program is distributed in the hope that it will be useful,\n\
+ but WITHOUT ANY WARRANTY; without even the implied warranty of\n\
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n\
+ GNU General Public License for more details.\n\
+\n\
+ You should have received a copy of the GNU General Public License\n\
+ along with this program; see the file COPYING3. If not see\n\
+ <http://www.gnu.org/licenses/>.\n\
+\n\
+\n\
+ Copyright (C) 1991-2021 Unicode, Inc. All rights reserved.\n\
+ Distributed under the Terms of Use in\n\
+ http://www.unicode.org/copyright.html.\n\
+\n\
+ Permission is hereby granted, free of charge, to any person\n\
+ obtaining a copy of the Unicode data files and any associated\n\
+ documentation (the \"Data Files\") or Unicode software and any\n\
+ associated documentation (the \"Software\") to deal in the Data Files\n\
+ or Software without restriction, including without limitation the\n\
+ rights to use, copy, modify, merge, publish, distribute, and/or\n\
+ sell copies of the Data Files or Software, and to permit persons to\n\
+ whom the Data Files or Software are furnished to do so, provided\n\
+ that (a) the above copyright notice(s) and this permission notice\n\
+ appear with all copies of the Data Files or Software, (b) both the\n\
+ above copyright notice(s) and this permission notice appear in\n\
+ associated documentation, and (c) there is clear notice in each\n\
+ modified Data File or in the Software as well as in the\n\
+ documentation associated with the Data File(s) or Software that the\n\
+ data or software has been modified.\n\
+\n\
+ THE DATA FILES AND SOFTWARE ARE PROVIDED \"AS IS\", WITHOUT WARRANTY\n\
+ OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE\n\
+ WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND\n\
+ NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE\n\
+ COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR\n\
+ ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY\n\
+ DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,\n\
+ WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS\n\
+ ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE\n\
+ OF THE DATA FILES OR SOFTWARE.\n\
+\n\
+ Except as contained in this notice, the name of a copyright holder\n\
+ shall not be used in advertising or otherwise to promote the sale,\n\
+ use or other dealings in these Data Files or Software without prior\n\
+ written authorization of the copyright holder. */\n";
+
+  puts (copyright);
+}
+
+/* Print the dictionary as the uname2c_dict array, initialized by
+   adjacent string literals of at most 77 characters each.  */
+
+static void
+write_dict (void)
+{
+  size_t i;
+
+  printf ("static const char uname2c_dict[%ld] =\n", (long) (dict_size + 1));
+  /* An empty dictionary still needs an initializer to be valid C.  */
+  if (dict_size == 0)
+    puts ("\"\";");
+  /* The piece starting at I is the last one iff no characters remain
+     after it; only that one gets the terminating semicolon.  */
+  for (i = 0; i < dict_size; i += 77)
+    printf ("\"%.77s\"%s\n", dict + i, i + 77 >= dict_size ? ";" : "");
+  puts ("");
+}
+
+/* Print the serialized radix tree as the uname2c_tree array, 12 bytes
+   per output line.  */
+
+static void
+write_tree (void)
+{
+  size_t i, j;
+
+  printf ("static const unsigned char uname2c_tree[%ld] = {\n",
+          (long) tree_size);
+  /* J is the column of the current byte within its line.  */
+  for (i = 0, j = 0; i < tree_size; ++i)
+    {
+      printf ("%s0x%02x%s", j == 0 ? " " : "", tree[i],
+              i == tree_size - 1 ? " };\n\n" : j == 11 ? ",\n" : ", ");
+      if (j == 11)
+        j = 0;
+      else
+        ++j;
+    }
+}
+
+/* Print the uname2c_pairs and uname2c_generated tables derived from
+   generated_ranges.  */
+
+static void
+write_generated (void)
+{
+  size_t i;
+  /* Index into uname2c_pairs.  It starts at -1 and is printed with %d,
+     so it has to be a plain int.  */
+  int j;
+
+  /* uname2c_pairs holds the low, high pair of every range; runs of
+     ranges sharing the same idx are terminated by a 0.  */
+  puts ("static const cppchar_t uname2c_pairs[] = {");
+  for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
+    {
+      if (i == 0)
+        ;
+      else if (generated_ranges[i - 1].idx != generated_ranges[i].idx)
+        puts (", 0,");
+      else
+        puts (",");
+      printf (" 0x%lx, 0x%lx /* %s */",
+              generated_ranges[i].low,
+              generated_ranges[i].high,
+              generated_ranges[i].prefix);
+    }
+  puts (", 0 };\n");
+
+  /* uname2c_generated holds, for every such run, the index of its first
+     pair in uname2c_pairs: each range advances J by 2 and the ++J at
+     the start of a run skips the 0 terminator of the previous one.  */
+  puts ("static const unsigned char uname2c_generated[] = {");
+  for (i = 0, j = -1; i < ARRAY_SIZE (generated_ranges); ++i)
+    {
+      if (i == 0 || generated_ranges[i - 1].idx != generated_ranges[i].idx)
+        printf ("%s %d /* %s */", i ? ",\n" : "",
+                ++j, generated_ranges[i].prefix);
+      j += 2;
+    }
+  puts (" };\n");
+}
+
+/* Main program.  Takes exactly two arguments, the names of the two
+   tables passed to read_table, and prints the generated header to
+   standard output.  */
+
+int
+main (int argc, char **argv)
+{
+  size_t i;
+
+  if (argc != 3)
+    fail ("makeuname2c expects exactly 2 arguments");
+  /* Ranges without an explicit max_high default it to high.  */
+  for (i = 0; i < ARRAY_SIZE (generated_ranges); ++i)
+    if (!generated_ranges[i].max_high)
+      generated_ranges[i].max_high = generated_ranges[i].high;
+  read_table (argv[1], false);
+  read_table (argv[2], true);
+  build_radix_tree ();
+
+  write_copyright ();
+  write_dict ();
+  write_tree ();
+  write_generated ();
+  printf ("static const unsigned int uname2c_max_name_len = %ld;\n\n",
+          (long) max_entry_len);
+  return 0;
+}
--- gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-1.c.jj 2022-08-25 10:04:15.176363221 +0200 +++ gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-1.c 2022-08-25 10:04:15.176363221 +0200 @@ -0,0 +1,174 @@ +/* P2071R2 - Named universal character escapes */ +/* { dg-do run } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=gnu99 -Wno-c++-compat" { target c } } */ +/* { dg-options "-std=c++23" { target c++ } } */ + +#ifndef __cplusplus +#include +typedef __CHAR16_TYPE__ char16_t; +typedef __CHAR32_TYPE__ char32_t; +#endif + +#ifdef __cplusplus +#if U'\u0000' != U'\N{NULL}' \ + || U'\u0001' != U'\N{START OF HEADING}' \ + || U'\u0002' != U'\N{START OF TEXT}' \ + || U'\u0003' != U'\N{END OF TEXT}' \ + || U'\u0004' != U'\N{END OF TRANSMISSION}' \ + || U'\u0005' != U'\N{ENQUIRY}' \ + || U'\u0006' != U'\N{ACKNOWLEDGE}' \ + || U'\u0007' != U'\N{ALERT}' \ + || U'\u0008' != U'\N{BACKSPACE}' \ + || U'\u0009' != U'\N{CHARACTER TABULATION}' \ + || U'\u0009' != U'\N{HORIZONTAL TABULATION}' \ + || U'\u000A' != U'\N{LINE FEED}' \ + || U'\u000A' != U'\N{NEW LINE}' \ + || U'\u000A' != U'\N{END OF LINE}' \ + || U'\u000B' != U'\N{LINE TABULATION}' \ + || U'\u000B' != U'\N{VERTICAL TABULATION}' \ + || U'\u000C' != U'\N{FORM FEED}' \ + || U'\u000D' != U'\N{CARRIAGE RETURN}' \ + || U'\u000E' != U'\N{SHIFT OUT}' \ + || 
U'\u000E' != U'\N{LOCKING-SHIFT ONE}' \ + || U'\u000F' != U'\N{SHIFT IN}' \ + || U'\u000F' != U'\N{LOCKING-SHIFT ZERO}' \ + || U'\u0010' != U'\N{DATA LINK ESCAPE}' \ + || U'\u0011' != U'\N{DEVICE CONTROL ONE}' \ + || U'\u0012' != U'\N{DEVICE CONTROL TWO}' \ + || U'\u0013' != U'\N{DEVICE CONTROL THREE}' \ + || U'\u0014' != U'\N{DEVICE CONTROL FOUR}' \ + || U'\u0015' != U'\N{NEGATIVE ACKNOWLEDGE}' \ + || U'\u0016' != U'\N{SYNCHRONOUS IDLE}' \ + || U'\u0017' != U'\N{END OF TRANSMISSION BLOCK}' \ + || U'\u0018' != U'\N{CANCEL}' \ + || U'\u0019' != U'\N{END OF MEDIUM}' \ + || U'\u001A' != U'\N{SUBSTITUTE}' \ + || U'\u001B' != U'\N{ESCAPE}' \ + || U'\u001C' != U'\N{INFORMATION SEPARATOR FOUR}' \ + || U'\u001C' != U'\N{FILE SEPARATOR}' \ + || U'\u001D' != U'\N{INFORMATION SEPARATOR THREE}' \ + || U'\u001D' != U'\N{GROUP SEPARATOR}' \ + || U'\u001E' != U'\N{INFORMATION SEPARATOR TWO}' \ + || U'\u001E' != U'\N{RECORD SEPARATOR}' \ + || U'\u001F' != U'\N{INFORMATION SEPARATOR ONE}' \ + || U'\u001F' != U'\N{UNIT SEPARATOR}' \ + || U'\u007F' != U'\N{DELETE}' \ + || U'\u0082' != U'\N{BREAK PERMITTED HERE}' \ + || U'\u0083' != U'\N{NO BREAK HERE}' \ + || U'\u0084' != U'\N{INDEX}' \ + || U'\u0085' != U'\N{NEXT LINE}' \ + || U'\u0086' != U'\N{START OF SELECTED AREA}' \ + || U'\u0087' != U'\N{END OF SELECTED AREA}' \ + || U'\u0088' != U'\N{CHARACTER TABULATION SET}' \ + || U'\u0088' != U'\N{HORIZONTAL TABULATION SET}' \ + || U'\u0089' != U'\N{CHARACTER TABULATION WITH JUSTIFICATION}' \ + || U'\u0089' != U'\N{HORIZONTAL TABULATION WITH JUSTIFICATION}' \ + || U'\u008A' != U'\N{LINE TABULATION SET}' \ + || U'\u008A' != U'\N{VERTICAL TABULATION SET}' \ + || U'\u008B' != U'\N{PARTIAL LINE FORWARD}' \ + || U'\u008B' != U'\N{PARTIAL LINE DOWN}' \ + || U'\u008C' != U'\N{PARTIAL LINE BACKWARD}' \ + || U'\u008C' != U'\N{PARTIAL LINE UP}' \ + || U'\u008D' != U'\N{REVERSE LINE FEED}' \ + || U'\u008D' != U'\N{REVERSE INDEX}' \ + || U'\u008E' != U'\N{SINGLE SHIFT TWO}' \ + || U'\u008E' != 
U'\N{SINGLE-SHIFT-2}' \ + || U'\u008F' != U'\N{SINGLE SHIFT THREE}' \ + || U'\u008F' != U'\N{SINGLE-SHIFT-3}' \ + || U'\u0090' != U'\N{DEVICE CONTROL STRING}' \ + || U'\u0091' != U'\N{PRIVATE USE ONE}' \ + || U'\u0091' != U'\N{PRIVATE USE-1}' \ + || U'\u0092' != U'\N{PRIVATE USE TWO}' \ + || U'\u0092' != U'\N{PRIVATE USE-2}' \ + || U'\u0093' != U'\N{SET TRANSMIT STATE}' \ + || U'\u0094' != U'\N{CANCEL CHARACTER}' \ + || U'\u0095' != U'\N{MESSAGE WAITING}' \ + || U'\u0096' != U'\N{START OF GUARDED AREA}' \ + || U'\u0096' != U'\N{START OF PROTECTED AREA}' \ + || U'\u0097' != U'\N{END OF GUARDED AREA}' \ + || U'\u0097' != U'\N{END OF PROTECTED AREA}' \ + || U'\u0098' != U'\N{START OF STRING}' \ + || U'\u009A' != U'\N{SINGLE CHARACTER INTRODUCER}' \ + || U'\u009B' != U'\N{CONTROL SEQUENCE INTRODUCER}' \ + || U'\u009C' != U'\N{STRING TERMINATOR}' \ + || U'\u009D' != U'\N{OPERATING SYSTEM COMMAND}' \ + || U'\u009E' != U'\N{PRIVACY MESSAGE}' \ + || U'\u009F' != U'\N{APPLICATION PROGRAM COMMAND}' \ + || U'\u0020' != U'\N{SPACE}' \ + || U'\u0030' != U'\N{DIGIT ZERO}' \ + || U'\u0053' != U'\N{LATIN CAPITAL LETTER S}' +#error Bad +#endif +#endif +#if U'\U0001F402' != U'\N{OX}' \ + || U'\U0001FBA9' != U'\N{BOX DRAWINGS LIGHT DIAGONAL UPPER CENTRE TO MIDDLE RIGHT AND MIDDLE LEFT TO LOWER CENTRE}' \ + || U'\u01FD' != U'\N{LATIN SMALL LETTER AE WITH ACUTE}' \ + || U'\u2118' != U'\N{WEIERSTRASS ELLIPTIC FUNCTION}' \ + || U'\u2118' != U'\N{SCRIPT CAPITAL P}' \ + || U'\uFEFF' != U'\N{BYTE ORDER MARK}' \ + || U'\uFEFF' != U'\N{ZERO WIDTH NO-BREAK SPACE}' \ + || U'\u116C' != U'\N{HANGUL JUNGSEONG OE}' \ + || U'\u1180' != U'\N{HANGUL JUNGSEONG O-E}' \ + || U'\u0F60' != U'\N{TIBETAN LETTER -A}' \ + || U'\u0F68' != U'\N{TIBETAN LETTER A}' \ + || U'\u0F0A' != U'\N{TIBETAN MARK BKA- SHOG YIG MGO}' \ + || U'\u0FD0' != U'\N{TIBETAN MARK BKA- SHOG GI MGO RGYAN}' \ + || U'\u0FD0' != U'\N{TIBETAN MARK BSKA- SHOG GI MGO RGYAN}' \ + || U'\uFE18' != U'\N{PRESENTATION FORM FOR VERTICAL RIGHT WHITE 
LENTICULAR BRAKCET}' \ + || U'\uFE18' != U'\N{PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET}' \ + || U'\uAC00' != U'\N{HANGUL SYLLABLE GA}' \ + || U'\uAC02' != U'\N{HANGUL SYLLABLE GAGG}' \ + || U'\uAD8D' != U'\N{HANGUL SYLLABLE GWEONJ}' \ + || U'\uAE4D' != U'\N{HANGUL SYLLABLE GGAG}' \ + || U'\uAE4E' != U'\N{HANGUL SYLLABLE GGAGG}' \ + || U'\uC544' != U'\N{HANGUL SYLLABLE A}' \ + || U'\uC55F' != U'\N{HANGUL SYLLABLE AH}' \ + || U'\uC560' != U'\N{HANGUL SYLLABLE AE}' \ + || U'\uD7A3' != U'\N{HANGUL SYLLABLE HIH}' \ + || U'\u3400' != U'\N{CJK UNIFIED IDEOGRAPH-3400}' \ + || U'\u4DBF' != U'\N{CJK UNIFIED IDEOGRAPH-4DBF}' \ + || U'\u4E00' != U'\N{CJK UNIFIED IDEOGRAPH-4E00}' \ + || U'\u9FFC' != U'\N{CJK UNIFIED IDEOGRAPH-9FFC}' \ + || U'\U00020000' != U'\N{CJK UNIFIED IDEOGRAPH-20000}' \ + || U'\U0002A6DD' != U'\N{CJK UNIFIED IDEOGRAPH-2A6DD}' \ + || U'\U00020700' != U'\N{CJK UNIFIED IDEOGRAPH-20700}' \ + || U'\U0002B734' != U'\N{CJK UNIFIED IDEOGRAPH-2B734}' \ + || U'\U0002B740' != U'\N{CJK UNIFIED IDEOGRAPH-2B740}' \ + || U'\U0002B81D' != U'\N{CJK UNIFIED IDEOGRAPH-2B81D}' \ + || U'\U0002B820' != U'\N{CJK UNIFIED IDEOGRAPH-2B820}' \ + || U'\U0002CEA1' != U'\N{CJK UNIFIED IDEOGRAPH-2CEA1}' \ + || U'\U0002CEB0' != U'\N{CJK UNIFIED IDEOGRAPH-2CEB0}' \ + || U'\U0002EBE0' != U'\N{CJK UNIFIED IDEOGRAPH-2EBE0}' \ + || U'\U00030000' != U'\N{CJK UNIFIED IDEOGRAPH-30000}' \ + || U'\U0003134A' != U'\N{CJK UNIFIED IDEOGRAPH-3134A}' \ + || U'\U00017000' != U'\N{TANGUT IDEOGRAPH-17000}' \ + || U'\U000187F7' != U'\N{TANGUT IDEOGRAPH-187F7}' \ + || U'\U00018D00' != U'\N{TANGUT IDEOGRAPH-18D00}' \ + || U'\U00018D08' != U'\N{TANGUT IDEOGRAPH-18D08}' \ + || U'\U00018B00' != U'\N{KHITAN SMALL SCRIPT CHARACTER-18B00}' \ + || U'\U00018CD5' != U'\N{KHITAN SMALL SCRIPT CHARACTER-18CD5}' \ + || U'\U0001B170' != U'\N{NUSHU CHARACTER-1B170}' \ + || U'\U0001B2FB' != U'\N{NUSHU CHARACTER-1B2FB}' \ + || U'\uF900' != U'\N{CJK COMPATIBILITY IDEOGRAPH-F900}' \ + || U'\uFA6D' != 
U'\N{CJK COMPATIBILITY IDEOGRAPH-FA6D}' \ + || U'\uFA70' != U'\N{CJK COMPATIBILITY IDEOGRAPH-FA70}' \ + || U'\uFAD9' != U'\N{CJK COMPATIBILITY IDEOGRAPH-FAD9}' \ + || U'\U0002F800' != U'\N{CJK COMPATIBILITY IDEOGRAPH-2F800}' \ + || U'\U0002FA1D' != U'\N{CJK COMPATIBILITY IDEOGRAPH-2FA1D}' +#error Bad +#endif + +const char32_t *a = U"\N{HEBREW LETTER KAF}\N{HEBREW LETTER FINAL NUN}"; +const char32_t *b = U"\N{OX}\N{BOX DRAWINGS LIGHT DIAGONAL UPPER CENTRE TO MIDDLE RIGHT AND MIDDLE LEFT TO LOWER CENTRE}"; + +#define b\N{LATIN SMALL LETTER O WITH ACUTE}x U'\U0001FBA9' + +int +main () +{ + if (a[0] != U'\u05DB' || a[1] != U'\U000005DF' || a[2] + || b[0] != U'\U0001F402' || b[1] != b\u{f3}x || b[2]) + __builtin_abort (); + return 0; +} --- gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-2.c.jj 2022-08-25 10:04:15.176363221 +0200 +++ gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-2.c 2022-08-25 10:04:15.176363221 +0200 @@ -0,0 +1,18 @@ +/* P2071R2 - Named universal character escapes */ +/* { dg-do compile } */ +/* { dg-options "-std=gnu99 -Wno-c++-compat" { target c } } */ +/* { dg-options "-std=c++23" { target c++ } } */ + +int jalape\N{LATIN SMALL LETTER N WITH TILDE}o = 42; + +int +caf\N{LATIN SMALL LETTER E WITH ACUTE} (void) +{ + return jalape\u00F1o; +} + +int +test (void) +{ + return caf\u00e9 (); +} --- gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-3.c.jj 2022-08-25 10:04:15.176363221 +0200 +++ gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-3.c 2022-08-25 10:04:15.176363221 +0200 @@ -0,0 +1,22 @@ +/* P2071R2 - Named universal character escapes */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=gnu99 -Wno-c++-compat" { target c } } */ +/* { dg-options "-std=c++23" { target c++ } } */ + +#ifndef __cplusplus +typedef __CHAR32_TYPE__ char32_t; +#endif + +const char32_t *a = U"\N{}"; /* { dg-error "empty named universal character escape sequence" } */ +const char32_t *b = 
U"\N{NU" "LL}"; /* { dg-error "'\\\\N\\{' not terminated with '\\}' after" } */ + /* { dg-error "is not a valid universal character" "" { target c } .-1 } */ +const char32_t *c = U"\N{ I've just made it up }"; /* { dg-error "'\\\\N\\{' not terminated with '\\}' after" } */ + /* { dg-error "is not a valid universal character" "" { target c } .-1 } */ +const char32_t *d = U"\N{_________ _______}"; /* { dg-error "is not a valid universal character" } */ +const char32_t *e = U"\N{O.X}"; /* { dg-error "'\\\\N\\{' not terminated with '\\}' after" } */ + /* { dg-error "is not a valid universal character" "" { target c } .-1 } */ +const char32_t *f = U"\N{.}"; /* { dg-error "'\\\\N\\{' not terminated with '\\}' after" } */ + /* { dg-error "is not a valid universal character" "" { target c } .-1 } */ +const char32_t *g = U"\N{BOM}"; /* { dg-error "is not a valid universal character" } */ +const char32_t *h = U"\N{ZWNBSP}"; /* { dg-error "is not a valid universal character" } */ --- gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-4.c.jj 2022-08-25 10:04:15.176363221 +0200 +++ gcc/testsuite/c-c++-common/cpp/named-universal-char-escape-4.c 2022-08-25 10:04:15.176363221 +0200 @@ -0,0 +1,60 @@ +/* P2071R2 - Named universal character escapes */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=gnu99 -Wno-c++-compat" { target c } } */ +/* { dg-options "-std=c++23" { target c++ } } */ + +#ifndef __cplusplus +typedef __CHAR32_TYPE__ char32_t; +#endif + +const char32_t *a = U"\N{ZERO WIDTH NO BREAK SPACE}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{ZERO WIDTH NO-BREAK SPACE\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *b = U"\N{giraffe face}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{GIRAFFE FACE\\}\\?" 
"" { target *-*-* } .-1 } */ +const char32_t *c = U"\N{Giraffe Face}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{GIRAFFE FACE\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *d = U"\N{ GiRaFfE_fAcE__ ___}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{GIRAFFE FACE\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *e = U"\N{GIRAFFE}"; /* { dg-error "is not a valid universal character" } */ +const char32_t *f = U"\N{Hangul_Syllable_gAgg_}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{HANGUL SYLLABLE GAGG\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *g = U"\N{HANGUL SYLLABLE gagg}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{HANGUL SYLLABLE GAGG\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *h = U"\N{HANGULSYLLABLEGAGG}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{HANGUL SYLLABLE GAGG\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *i = U"\N{HANGUL_SYLLABLE_GAGG}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{HANGUL SYLLABLE GAGG\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *j = U"\N{HANGUL SYLLABLE }"; /* { dg-error "is not a valid universal character" } */ +const char32_t *k = U"\N{CJK-COMPATIBILITY-IDEOGRAPH-2F801}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{CJK COMPATIBILITY IDEOGRAPH-2F801\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *l = U"\N{CjK_COMPATIBILITY IDEOGRAPH 2f801}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{CJK COMPATIBILITY IDEOGRAPH-2F801\\}\\?" 
"" { target *-*-* } .-1 } */ +const char32_t *m = U"\N{CjK_COMPATIBILITY IDEOGRAPH 2f80}"; /* { dg-error "is not a valid universal character" } */ +const char32_t *n = U"\N{CJK COMPATIBILITY IDEOGRAPH-}"; /* { dg-error "is not a valid universal character" } */ +const char32_t *o = U"\N{CJK COMPATIBILITY IDEOGRAPH-X}"; /* { dg-error "is not a valid universal character" } */ +const char32_t *p = U"\N{Tibetan Letter A}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{TIBETAN LETTER A\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *q = U"\N{Tibetan LetterA}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{TIBETAN LETTER A\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *r = U"\N{Tibetan Letter-A}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{TIBETAN LETTER A\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *s = U"\N{Tibetan Letter -A}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{TIBETAN LETTER -A\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *t = U"\N{TibetanLetter -A}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{TIBETAN LETTER -A\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *u = U"\N{Hangul Jungseong oe}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{HANGUL JUNGSEONG OE\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *v = U"\N{Hangul Jungseong o- e}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{HANGUL JUNGSEONG O-E\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *w = U"\N{HangulJungseongo-e}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{HANGUL JUNGSEONG O-E\\}\\?" 
"" { target *-*-* } .-1 } */ +const char32_t *x = U"\N{Hangul Jungseong oe __ }"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{HANGUL JUNGSEONG OE\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *y = U"\N{Hangul Jungseong o- e __ }"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{HANGUL JUNGSEONG O-E\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *z = U"\N{Hangul Jungseong o -e}"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{HANGUL JUNGSEONG O-E\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *A = U"\N{Hangul Jungseong o -e __ }"; /* { dg-error "is not a valid universal character" } */ + /* { dg-message "did you mean \\\\N\\{HANGUL JUNGSEONG O-E\\}\\?" "" { target *-*-* } .-1 } */ +const char32_t *B = U"\N{O}"; /* { dg-error "is not a valid universal character" } */ --- gcc/testsuite/c-c++-common/Wbidi-chars-25.c.jj 2022-08-25 10:04:15.176363221 +0200 +++ gcc/testsuite/c-c++-common/Wbidi-chars-25.c 2022-08-25 10:04:15.176363221 +0200 @@ -0,0 +1,28 @@ +/* PR preprocessor/103026 */ +/* { dg-do compile } */ +/* { dg-options "-Wbidi-chars=ucn,unpaired" } */ +/* Test nesting of bidi chars in various contexts. 
*/ + +void +g1 () +{ + const char *s1 = "a b c LRE\N{LEFT-TO-RIGHT EMBEDDING} 1 2 3 PDI\N{POP DIRECTIONAL ISOLATE} x y z"; +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */ + const char *s2 = "a b c RLE\N{RIGHT-TO-LEFT EMBEDDING} 1 2 3 PDI\N{POP DIRECTIONAL ISOLATE} x y z"; +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */ + const char *s3 = "a b c LRO\N{LEFT-TO-RIGHT OVERRIDE} 1 2 3 PDI\N{POP DIRECTIONAL ISOLATE} x y z"; +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */ + const char *s4 = "a b c RLO\N{RIGHT-TO-LEFT OVERRIDE} 1 2 3 PDI\N{POP DIRECTIONAL ISOLATE} x y z"; +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */ + const char *s5 = "a b c LRI\N{LEFT-TO-RIGHT ISOLATE} 1 2 3 PDF\N{POP DIRECTIONAL FORMATTING} x y z"; +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */ + const char *s6 = "a b c RLI\N{RIGHT-TO-LEFT ISOLATE} 1 2 3 PDF\N{POP DIRECTIONAL FORMATTING} x y z"; +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */ + const char *s7 = "a b c FSI\N{FIRST STRONG ISOLATE} 1 2 3 PDF\N{POP DIRECTIONAL FORMATTING} x y z"; +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */ +} + +int A\N{LEFT-TO-RIGHT EMBEDDING}B\N{POP DIRECTIONAL ISOLATE}C; +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */ +int a\N{RIGHT-TO-LEFT EMBEDDING}B\N{POP DIRECTIONAL ISOLATE}c; +/* { dg-warning "unpaired" "" { target *-*-* } .-1 } */ --- gcc/testsuite/gcc.dg/cpp/named-universal-char-escape-1.c.jj 2022-08-25 10:04:15.176363221 +0200 +++ gcc/testsuite/gcc.dg/cpp/named-universal-char-escape-1.c 2022-08-25 10:04:15.176363221 +0200 @@ -0,0 +1,8 @@ +/* P2071R2 - Named universal character escapes */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=gnu99 -Wno-c++-compat -pedantic" } */ + +typedef __CHAR32_TYPE__ char32_t; + +const char32_t *a = U"\N{ETHIOPIC SYLLABLE SEE}"; /* { dg-warning "named universal character escapes are only valid in" } */ --- 
gcc/testsuite/gcc.dg/cpp/named-universal-char-escape-2.c.jj 2022-08-25 10:04:15.176363221 +0200 +++ gcc/testsuite/gcc.dg/cpp/named-universal-char-escape-2.c 2022-08-25 10:04:15.176363221 +0200 @@ -0,0 +1,8 @@ +/* P2071R2 - Named universal character escapes */ +/* { dg-do compile } */ +/* { dg-require-effective-target wchar } */ +/* { dg-options "-std=gnu99 -Wno-c++-compat -pedantic-errors" } */ + +typedef __CHAR32_TYPE__ char32_t; + +const char32_t *a = U"\N{ETHIOPIC SYLLABLE SEE}"; /* { dg-error "named universal character escapes are only valid in" } */ --- gcc/testsuite/g++.dg/cpp/named-universal-char-escape-1.C.jj 2022-08-25 10:04:15.176363221 +0200 +++ gcc/testsuite/g++.dg/cpp/named-universal-char-escape-1.C 2022-08-25 10:04:15.176363221 +0200 @@ -0,0 +1,6 @@ +// P2071R2 - Named universal character escapes +// { dg-do compile { target c++11 } } +// { dg-require-effective-target wchar } +// { dg-options "-pedantic" } + +const char32_t *a = U"\N{ETHIOPIC SYLLABLE SEE}"; // { dg-warning "named universal character escapes are only valid in" "" { target c++20_down } } --- gcc/testsuite/g++.dg/cpp/named-universal-char-escape-2.C.jj 2022-08-25 10:04:15.176363221 +0200 +++ gcc/testsuite/g++.dg/cpp/named-universal-char-escape-2.C 2022-08-25 10:04:15.176363221 +0200 @@ -0,0 +1,6 @@ +// P2071R2 - Named universal character escapes +// { dg-do compile { target c++11 } } +// { dg-require-effective-target wchar } +// { dg-options "-pedantic-errors" } + +const char32_t *a = U"\N{ETHIOPIC SYLLABLE SEE}"; // { dg-error "named universal character escapes are only valid in" "" { target c++20_down } }