diff --git a/contrib/unicode/from_glibc/unicode_utils.py b/contrib/unicode/from_glibc/unicode_utils.py
new file mode 100644
index 00000000000..a9e94cce418
--- /dev/null
+++ b/contrib/unicode/from_glibc/unicode_utils.py
@@ -0,0 +1,527 @@
+# Utilities to generate Unicode data for glibc from upstream Unicode data.
+#
+# Copyright (C) 2014-2019 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+'''
+This module contains utilities used by the scripts to generate
+Unicode data for glibc from upstream Unicode data files.
+'''
+
+import sys
+import re
+
+
+# Common locale header.
+COMMENT_HEADER = """
+% This file is part of the GNU C Library and contains locale data.
+% The Free Software Foundation does not claim any copyright interest
+% in the locale data contained in this file. The foregoing does not
+% affect the license of the GNU C Library as a whole. It does not
+% exempt you from the conditions of the license if your use would
+% otherwise be governed by that license.
+"""
+
+# Dictionary holding the entire contents of the UnicodeData.txt file
+#
+# Contents of this dictionary look like this:
+#
+# {0: {'category': 'Cc',
+#      'title': None,
+#      'digit': '',
+#      'name': '<control>',
+#      'bidi': 'BN',
+#      'combining': '0',
+#      'comment': '',
+#      'oldname': 'NULL',
+#      'decomposition': '',
+#      'upper': None,
+#      'mirrored': 'N',
+#      'lower': None,
+#      'decdigit': '',
+#      'numeric': ''},
+#  …
+# }
+UNICODE_ATTRIBUTES = {}
+
+# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
+#
+# Contents of this dictionary look like this:
+#
+# {917504: ['Default_Ignorable_Code_Point'],
+#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
+#  …
+# }
+DERIVED_CORE_PROPERTIES = {}
+
+# Dictionary holding the entire contents of the EastAsianWidths.txt file
+#
+# Contents of this dictionary look like this:
+#
+# {0: 'N', … , 45430: 'W', …}
+EAST_ASIAN_WIDTHS = {}
+
+def fill_attribute(code_point, fields):
+    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
+
+    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
+    in the UnicodeData.txt file.
+
+    '''
+    UNICODE_ATTRIBUTES[code_point] = {
+        'name': fields[1],          # Character name
+        'category': fields[2],      # General category
+        'combining': fields[3],     # Canonical combining classes
+        'bidi': fields[4],          # Bidirectional category
+        'decomposition': fields[5], # Character decomposition mapping
+        'decdigit': fields[6],      # Decimal digit value
+        'digit': fields[7],         # Digit value
+        'numeric': fields[8],       # Numeric value
+        'mirrored': fields[9],      # mirrored
+        'oldname': fields[10],      # Old Unicode 1.0 name
+        'comment': fields[11],      # comment
+        # Uppercase mapping
+        'upper': int(fields[12], 16) if fields[12] else None,
+        # Lowercase mapping
+        'lower': int(fields[13], 16) if fields[13] else None,
+        # Titlecase mapping
+        'title': int(fields[14], 16) if fields[14] else None,
+    }
+
+def fill_attributes(filename):
+    '''Stores the entire contents of the UnicodeData.txt file
+    in the UNICODE_ATTRIBUTES dictionary.
+
+    A typical line for a single code point in UnicodeData.txt looks
+    like this:
+
+    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
+
+    Code point ranges are indicated by pairs of lines like this:
+
+    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
+    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
+    '''
+    with open(filename, mode='r') as unicode_data_file:
+        fields_start = []
+        for line in unicode_data_file:
+            fields = line.strip().split(';')
+            if len(fields) != 15:
+                sys.stderr.write(
+                    'short line in file "%(f)s": %(l)s\n' %{
+                        'f': filename, 'l': line})
+                sys.exit(1)
+            if fields[2] == 'Cs':
+                # Surrogates are UTF-16 artefacts,
+                # not real characters. Ignore them.
+                fields_start = []
+                continue
+            if fields[1].endswith(', First>'):
+                fields_start = fields
+                fields_start[1] = fields_start[1].split(',')[0][1:]
+                continue
+            if fields[1].endswith(', Last>'):
+                fields[1] = fields[1].split(',')[0][1:]
+                if fields[1:] != fields_start[1:]:
+                    sys.stderr.write(
+                        'broken code point range in file "%(f)s": %(l)s\n' %{
+                            'f': filename, 'l': line})
+                    sys.exit(1)
+                for code_point in range(
+                        int(fields_start[0], 16),
+                        int(fields[0], 16)+1):
+                    fill_attribute(code_point, fields)
+                fields_start = []
+                continue
+            fill_attribute(int(fields[0], 16), fields)
+            fields_start = []
+
+def fill_derived_core_properties(filename):
+    '''Stores the entire contents of the DerivedCoreProperties.txt file
+    in the DERIVED_CORE_PROPERTIES dictionary.
+
+    Lines in DerivedCoreProperties.txt are either a code point range like
+    this:
+
+    0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
+
+    or a single code point like this:
+
+    00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR
+
+    '''
+    with open(filename, mode='r') as derived_core_properties_file:
+        for line in derived_core_properties_file:
+            match = re.match(
+                r'^(?P<codepoint1>[0-9A-F]{4,6})'
+                + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
+                + r'\s*;\s*(?P<property>[a-zA-Z_]+)',
+                line)
+            if not match:
+                continue
+            start = match.group('codepoint1')
+            end = match.group('codepoint2')
+            if not end:
+                end = start
+            for code_point in range(int(start, 16), int(end, 16)+1):
+                prop = match.group('property')
+                if code_point in DERIVED_CORE_PROPERTIES:
+                    DERIVED_CORE_PROPERTIES[code_point].append(prop)
+                else:
+                    DERIVED_CORE_PROPERTIES[code_point] = [prop]
+
+def fill_east_asian_widths(filename):
+    '''Stores the entire contents of the EastAsianWidths.txt file
+    in the EAST_ASIAN_WIDTHS dictionary.
+
+    Lines in EastAsianWidths.txt are either a code point range like
+    this:
+
+    9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF>
+
+    or a single code point like this:
+
+    A015;W # Lm YI SYLLABLE WU
+    '''
+    with open(filename, mode='r') as east_asian_widths_file:
+        for line in east_asian_widths_file:
+            match = re.match(
+                r'^(?P<codepoint1>[0-9A-F]{4,6})'
+                +r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
+                +r'\s*;\s*(?P<property>[a-zA-Z]+)',
+                line)
+            if not match:
+                continue
+            start = match.group('codepoint1')
+            end = match.group('codepoint2')
+            if not end:
+                end = start
+            for code_point in range(int(start, 16), int(end, 16)+1):
+                EAST_ASIAN_WIDTHS[code_point] = match.group('property')
+
+def to_upper(code_point):
+    '''Returns the code point of the uppercase version
+    of the given code point'''
+    if (UNICODE_ATTRIBUTES[code_point]['name']
+            and UNICODE_ATTRIBUTES[code_point]['upper']):
+        return UNICODE_ATTRIBUTES[code_point]['upper']
+    else:
+        return code_point
+
+def to_lower(code_point):
+    '''Returns the code point of the lowercase version
+    of the given code point'''
+    if (UNICODE_ATTRIBUTES[code_point]['name']
+            and UNICODE_ATTRIBUTES[code_point]['lower']):
+        return UNICODE_ATTRIBUTES[code_point]['lower']
+    else:
+        return code_point
+
+def to_upper_turkish(code_point):
+    '''Returns the code point of the Turkish uppercase version
+    of the given code point'''
+    if code_point == 0x0069:
+        return 0x0130
+    return to_upper(code_point)
+
+def to_lower_turkish(code_point):
+    '''Returns the code point of the Turkish lowercase version
+    of the given code point'''
+    if code_point == 0x0049:
+        return 0x0131
+    return to_lower(code_point)
+
+def to_title(code_point):
+    '''Returns the code point of the titlecase version
+    of the given code point'''
+    if (UNICODE_ATTRIBUTES[code_point]['name']
+            and UNICODE_ATTRIBUTES[code_point]['title']):
+        return UNICODE_ATTRIBUTES[code_point]['title']
+    else:
+        return code_point
+
+def is_upper(code_point):
+    '''Checks whether the character with this code point is uppercase'''
+    return (to_lower(code_point) != code_point
+            or (code_point in DERIVED_CORE_PROPERTIES
+                and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))
+
+def is_lower(code_point):
+    '''Checks whether the character with this code point is lowercase'''
+    # Some characters are defined as “Lowercase” in
+    # DerivedCoreProperties.txt but do not have a mapping to upper
+    # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
+    # one of these.
+    return (to_upper(code_point) != code_point
+            # <U00DF> is lowercase, but without simple to_upper mapping.
+            or code_point == 0x00DF
+            or (code_point in DERIVED_CORE_PROPERTIES
+                and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))
+
+def is_alpha(code_point):
+    '''Checks whether the character with this code point is alphabetic'''
+    return ((code_point in DERIVED_CORE_PROPERTIES
+             and
+             'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
+            or
+            # Consider all the non-ASCII digits as alphabetic.
+            # ISO C 99 forbids us to have them in category “digit”,
+            # but we want iswalnum to return true on them.
+            (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
+             and not (code_point >= 0x0030 and code_point <= 0x0039)))
+
+def is_digit(code_point):
+    '''Checks whether the character with this code point is a digit'''
+    if False:
+        return (UNICODE_ATTRIBUTES[code_point]['name']
+                and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
+        # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
+        # a zero. Must add <0> in front of them by hand.
+    else:
+        # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
+        # takes it away:
+        # 7.25.2.1.5:
+        #    The iswdigit function tests for any wide character that
+        #    corresponds to a decimal-digit character (as defined in 5.2.1).
+        # 5.2.1:
+        #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
+        return (code_point >= 0x0030 and code_point <= 0x0039)
+
+def is_outdigit(code_point):
+    '''Checks whether the character with this code point is outdigit'''
+    return (code_point >= 0x0030 and code_point <= 0x0039)
+
+def is_blank(code_point):
+    '''Checks whether the character with this code point is blank'''
+    return (code_point == 0x0009 # '\t'
+            # Category Zs without mention of '<noBreak>'
+            or (UNICODE_ATTRIBUTES[code_point]['name']
+                and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
+                and '<noBreak>' not in
+                UNICODE_ATTRIBUTES[code_point]['decomposition']))
+
+def is_space(code_point):
+    '''Checks whether the character with this code point is a space'''
+    # Don’t make U+00A0 a space. Non-breaking space means that all programs
+    # should treat it like a punctuation character, not like a space.
+    return (code_point == 0x0020 # ' '
+            or code_point == 0x000C # '\f'
+            or code_point == 0x000A # '\n'
+            or code_point == 0x000D # '\r'
+            or code_point == 0x0009 # '\t'
+            or code_point == 0x000B # '\v'
+            # Categories Zl, Zp, and Zs without mention of "<noBreak>"
+            or (UNICODE_ATTRIBUTES[code_point]['name']
+                and
+                (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
+                 or
+                 (UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
+                  and
+                  '<noBreak>' not in
+                  UNICODE_ATTRIBUTES[code_point]['decomposition']))))
+
+def is_cntrl(code_point):
+    '''Checks whether the character with this code point is
+    a control character'''
+    return (UNICODE_ATTRIBUTES[code_point]['name']
+            and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
+                 or
+                 UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))
+
+def is_xdigit(code_point):
+    '''Checks whether the character with this code point is
+    a hexadecimal digit'''
+    if False:
+        return (is_digit(code_point)
+                or (code_point >= 0x0041 and code_point <= 0x0046)
+                or (code_point >= 0x0061 and code_point <= 0x0066))
+    else:
+        # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
+        # takes it away:
+        # 7.25.2.1.12:
+        #    The iswxdigit function tests for any wide character that
+        #    corresponds to a hexadecimal-digit character (as defined
+        #    in 6.4.4.1).
+        # 6.4.4.1:
+        #    hexadecimal-digit: one of
+        #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
+        return ((code_point >= 0x0030 and code_point <= 0x0039)
+                or (code_point >= 0x0041 and code_point <= 0x0046)
+                or (code_point >= 0x0061 and code_point <= 0x0066))
+
+def is_graph(code_point):
+    '''Checks whether the character with this code point is
+    a graphical character'''
+    return (UNICODE_ATTRIBUTES[code_point]['name']
+            and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
+            and not is_space(code_point))
+
+def is_print(code_point):
+    '''Checks whether the character with this code point is printable'''
+    return (UNICODE_ATTRIBUTES[code_point]['name']
+            and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
+            and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])
+
+def is_punct(code_point):
+    '''Checks whether the character with this code point is punctuation'''
+    if False:
+        return (UNICODE_ATTRIBUTES[code_point]['name']
+                and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P'))
+    else:
+        # The traditional POSIX definition of punctuation is every graphic,
+        # non-alphanumeric character.
+        return (is_graph(code_point)
+                and not is_alpha(code_point)
+                and not is_digit(code_point))
+
+def is_combining(code_point):
+    '''Checks whether the character with this code point is
+    a combining character'''
+    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
+    # file. In 3.0.1 it was identical to the union of the general categories
+    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
+    # PropList.txt file, so we take the latter definition.
+    return (UNICODE_ATTRIBUTES[code_point]['name']
+            and
+            UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])
+
+def is_combining_level3(code_point):
+    '''Checks whether the character with this code point is
+    a combining level3 character'''
+    return (is_combining(code_point)
+            and
+            int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))
+
+def ucs_symbol(code_point):
+    '''Return the UCS symbol string for a Unicode character.'''
+    if code_point < 0x10000:
+        return '<U{:04X}>'.format(code_point)
+    else:
+        return '<U{:08X}>'.format(code_point)
+
+def ucs_symbol_range(code_point_low, code_point_high):
+    '''Returns a string UCS symbol string for a code point range.
+
+    Example:
+
+    <U0041>..<U005A>
+    '''
+    return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)
+
+def verifications():
+    '''Tests whether the is_* functions observe the known restrictions'''
+    for code_point in sorted(UNICODE_ATTRIBUTES):
+        # toupper restriction: "Only characters specified for the keywords
+        # lower and upper shall be specified.
+        if (to_upper(code_point) != code_point
+                and not (is_lower(code_point) or is_upper(code_point))):
+            sys.stderr.write(
+                ('%(sym)s is not upper|lower '
+                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
+                     'sym': ucs_symbol(code_point),
+                     'c': code_point,
+                     'uc': to_upper(code_point)})
+        # tolower restriction: "Only characters specified for the keywords
+        # lower and upper shall be specified.
+        if (to_lower(code_point) != code_point
+                and not (is_lower(code_point) or is_upper(code_point))):
+            sys.stderr.write(
+                ('%(sym)s is not upper|lower '
+                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
+                     'sym': ucs_symbol(code_point),
+                     'c': code_point,
+                     'uc': to_lower(code_point)})
+        # alpha restriction: "Characters classified as either upper or lower
+        # shall automatically belong to this class.
+        if ((is_lower(code_point) or is_upper(code_point))
+                and not is_alpha(code_point)):
+            sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
+                'sym': ucs_symbol(code_point)})
+        # alpha restriction: “No character specified for the keywords cntrl,
+        # digit, punct or space shall be specified.”
+        if (is_alpha(code_point) and is_cntrl(code_point)):
+            sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_alpha(code_point) and is_digit(code_point)):
+            sys.stderr.write('%(sym)s is alpha and digit\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_alpha(code_point) and is_punct(code_point)):
+            sys.stderr.write('%(sym)s is alpha and punct\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_alpha(code_point) and is_space(code_point)):
+            sys.stderr.write('%(sym)s is alpha and space\n' %{
+                'sym': ucs_symbol(code_point)})
+        # space restriction: “No character specified for the keywords upper,
+        # lower, alpha, digit, graph or xdigit shall be specified.”
+        # upper, lower, alpha already checked above.
+        if (is_space(code_point) and is_digit(code_point)):
+            sys.stderr.write('%(sym)s is space and digit\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_space(code_point) and is_graph(code_point)):
+            sys.stderr.write('%(sym)s is space and graph\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_space(code_point) and is_xdigit(code_point)):
+            sys.stderr.write('%(sym)s is space and xdigit\n' %{
+                'sym': ucs_symbol(code_point)})
+        # cntrl restriction: “No character specified for the keywords upper,
+        # lower, alpha, digit, punct, graph, print or xdigit shall be
+        # specified.” upper, lower, alpha already checked above.
+        if (is_cntrl(code_point) and is_digit(code_point)):
+            sys.stderr.write('%(sym)s is cntrl and digit\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_cntrl(code_point) and is_punct(code_point)):
+            sys.stderr.write('%(sym)s is cntrl and punct\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_cntrl(code_point) and is_graph(code_point)):
+            sys.stderr.write('%(sym)s is cntrl and graph\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_cntrl(code_point) and is_print(code_point)):
+            sys.stderr.write('%(sym)s is cntrl and print\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_cntrl(code_point) and is_xdigit(code_point)):
+            sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
+                'sym': ucs_symbol(code_point)})
+        # punct restriction: “No character specified for the keywords upper,
+        # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
+        # be specified.” upper, lower, alpha, cntrl already checked above.
+        if (is_punct(code_point) and is_digit(code_point)):
+            sys.stderr.write('%(sym)s is punct and digit\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_punct(code_point) and is_xdigit(code_point)):
+            sys.stderr.write('%(sym)s is punct and xdigit\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (is_punct(code_point) and code_point == 0x0020):
+            sys.stderr.write('%(sym)s is punct\n' %{
+                'sym': ucs_symbol(code_point)})
+        # graph restriction: “No character specified for the keyword cntrl
+        # shall be specified.” Already checked above.
+
+        # print restriction: “No character specified for the keyword cntrl
+        # shall be specified.” Already checked above.
+
+        # graph - print relation: differ only in the <space> character.
+        # How is this possible if there are more than one space character?!
+        # I think susv2/xbd/locale.html should speak of “space characters”,
+        # not “space character”.
+        if (is_print(code_point)
+                and not (is_graph(code_point) or is_space(code_point))):
+            sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
+                'sym': ucs_symbol(code_point)})
+        if (not is_print(code_point)
+                and (is_graph(code_point) or code_point == 0x0020)):
+            sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
+                'sym': ucs_symbol(code_point)})
diff --git a/contrib/unicode/from_glibc/utf8_gen.py b/contrib/unicode/from_glibc/utf8_gen.py
new file mode 100755
index 00000000000..0e5583cd259
--- /dev/null
+++ b/contrib/unicode/from_glibc/utf8_gen.py
@@ -0,0 +1,364 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# Copyright (C) 2014-2019 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+'''glibc/localedata/charmaps/UTF-8 file generator script
+
+This script generates a glibc/localedata/charmaps/UTF-8 file
+from Unicode data.
+
+Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt
+
+It will output UTF-8 file
+'''
+
+import argparse
+import sys
+import re
+import unicode_utils
+
+# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
+# sections 3.11 and 4.4.
+
+JAMO_INITIAL_SHORT_NAME = (
+    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
+    'C', 'K', 'T', 'P', 'H'
+)
+
+JAMO_MEDIAL_SHORT_NAME = (
+    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
+    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
+)
+
+JAMO_FINAL_SHORT_NAME = (
+    '', 'G', 'GG', 'GS', 'N', 'NI', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
+    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
+    'P', 'H'
+)
+
+def process_range(start, end, outfile, name):
+    '''Writes a range of code points into the CHARMAP section of the
+    output file
+
+    '''
+    if 'Hangul Syllable' in name:
+        # from glibc/localedata/ChangeLog:
+        #
+        #  2000-09-24 Bruno Haible <haible@clisp.cons.org>
+        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
+        #  so they become printable and carry a width. Comment out surrogate
+        #  ranges. Add a WIDTH table
+        #
+        # So we expand the Hangul Syllables here:
+        for i in range(int(start, 16), int(end, 16)+1 ):
+            index2, index3 = divmod(i - 0xaC00, 28)
+            index1, index2 = divmod(index2, 21)
+            hangul_syllable_name = 'HANGUL SYLLABLE ' \
+                                   + JAMO_INITIAL_SHORT_NAME[index1] \
+                                   + JAMO_MEDIAL_SHORT_NAME[index2] \
+                                   + JAMO_FINAL_SHORT_NAME[index3]
+            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
+                unicode_utils.ucs_symbol(i), convert_to_hex(i),
+                hangul_syllable_name))
+        return
+    # The UnicodeData.txt file contains code point ranges like this:
+    #
+    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
+    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
+    #
+    # The glibc UTF-8 file splits ranges like these into shorter
+    # ranges of 64 code points each:
+    #
+    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
+    # …
+    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
+    for i in range(int(start, 16), int(end, 16), 64 ):
+        if i > (int(end, 16)-64):
+            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
+                unicode_utils.ucs_symbol(i),
+                unicode_utils.ucs_symbol(int(end,16)),
+                convert_to_hex(i),
+                name))
+            break
+        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
+            unicode_utils.ucs_symbol(i),
+            unicode_utils.ucs_symbol(i+63),
+            convert_to_hex(i),
+            name))
+
+def process_charmap(flines, outfile):
+    '''This function takes an array which contains *all* lines of
+    of UnicodeData.txt and write lines to outfile as used in the
+
+    CHARMAP
+    …
+    END CHARMAP
+
+    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.
+
+    Samples for input lines:
+
+    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
+    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
+    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
+    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
+    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
+    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
+    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;
+
+    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):
+
+    <U0010> /x10 DATA LINK ESCAPE
+    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
+    %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First>
+    %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last>
+    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>
+
+    '''
+    fields_start = []
+    for line in flines:
+        fields = line.split(";")
+        # Some characters have “<control>” as their name. We try to
+        # use the “Unicode 1.0 Name” (10th field in
+        # UnicodeData.txt) for them.
+        #
+        # The Characters U+0080, U+0081, U+0084 and U+0099 have
+        # “<control>” as their name but do not even have a
+        # ”Unicode 1.0 Name”. We could write code to take their
+        # alternate names from NameAliases.txt.
+        if fields[1] == "<control>" and fields[10]:
+            fields[1] = fields[10]
+        # Handling code point ranges like:
+        #
+        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
+        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
+        if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
+            fields_start = fields
+            continue
+        if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
+            process_range(fields_start[0], fields[0],
+                          outfile, fields[1][:-7]+'>')
+            fields_start = []
+            continue
+        fields_start = []
+        if 'Surrogate,' in fields[1]:
+            # Comment out the surrogates in the UTF-8 file.
+            # One could of course skip them completely but
+            # the original UTF-8 file in glibc had them as
+            # comments, so we keep these comment lines.
+            outfile.write('%')
+        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
+            unicode_utils.ucs_symbol(int(fields[0], 16)),
+            convert_to_hex(int(fields[0], 16)),
+            fields[1]))
+
+def convert_to_hex(code_point):
+    '''Converts a code point to a hexadecimal UTF-8 representation
+    like /x**/x**/x**.'''
+    # Getting UTF8 of Unicode characters.
+    # In Python3, .encode('UTF-8') does not work for
+    # surrogates. Therefore, we use this conversion table
+    surrogates = {
+        0xD800: '/xed/xa0/x80',
+        0xDB7F: '/xed/xad/xbf',
+        0xDB80: '/xed/xae/x80',
+        0xDBFF: '/xed/xaf/xbf',
+        0xDC00: '/xed/xb0/x80',
+        0xDFFF: '/xed/xbf/xbf',
+    }
+    if code_point in surrogates:
+        return surrogates[code_point]
+    return ''.join([
+        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
+    ])
+
+def write_header_charmap(outfile):
+    '''Write the header on top of the CHARMAP section to the output file'''
+    outfile.write("<code_set_name> UTF-8\n")
+    outfile.write("<comment_char> %\n")
+    outfile.write("<escape_char> /\n")
+    outfile.write("<mb_cur_min> 1\n")
+    outfile.write("<mb_cur_max> 6\n\n")
+    outfile.write("% CHARMAP generated using utf8_gen.py\n")
+    outfile.write("% alias ISO-10646/UTF-8\n")
+    outfile.write("CHARMAP\n")
+
+def write_header_width(outfile, unicode_version):
+    '''Writes the header on top of the WIDTH section to the output file'''
+    outfile.write('% Character width according to Unicode '
+                  + '{:s}.\n'.format(unicode_version))
+    outfile.write('% - Default width is 1.\n')
+    outfile.write('% - Double-width characters have width 2; generated from\n')
+    outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
+    outfile.write('% - Non-spacing characters have width 0; '
+                  + 'generated from PropList.txt or\n')
+    outfile.write('% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
+                  + 'UnicodeData.txt"\n')
+    outfile.write('% - Format control characters have width 0; '
+                  + 'generated from\n')
+    outfile.write("% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n")
+#   Not needed covered by Cf
+#    outfile.write("% - Zero width characters have width 0; generated from\n")
+#    outfile.write("% \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
+    outfile.write("WIDTH\n")
+
+def process_width(outfile, ulines, elines, plines):
+    '''ulines are lines from UnicodeData.txt, elines are lines from
+    EastAsianWidth.txt containing characters with width “W” or “F”,
+    plines are lines from PropList.txt which contain characters
+    with the property “Prepended_Concatenation_Mark”.
+
+    '''
+    width_dict = {}
+    for line in elines:
+        fields = line.split(";")
+        if not '..' in fields[0]:
+            code_points = (fields[0], fields[0])
+        else:
+            code_points = fields[0].split("..")
+        for key in range(int(code_points[0], 16),
+                         int(code_points[1], 16)+1):
+            width_dict[key] = 2
+
+    for line in ulines:
+        fields = line.split(";")
+        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
+            width_dict[int(fields[0], 16)] = 0
+
+    for line in plines:
+        # Characters with the property “Prepended_Concatenation_Mark”
+        # should have the width 1:
+        fields = line.split(";")
+        if not '..' in fields[0]:
+            code_points = (fields[0], fields[0])
+        else:
+            code_points = fields[0].split("..")
+        for key in range(int(code_points[0], 16),
+                         int(code_points[1], 16)+1):
+            width_dict.pop(key, None) # default width is 1; key may be absent
+
+    # handle special cases for compatibility
+    for key in list((0x00AD,)):
+        # https://www.cs.tut.fi/~jkorpela/shy.html
+        if key in width_dict:
+            del width_dict[key] # default width is 1
+    for key in list(range(0x1160, 0x1200)):
+        width_dict[key] = 0
+    for key in list(range(0x3248, 0x3250)):
+        # These are “A” which means we can decide whether to treat them
+        # as “W” or “N” based on context:
+        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
+        # For us, “W” seems better.
+        width_dict[key] = 2
+    for key in list(range(0x4DC0, 0x4E00)):
+        width_dict[key] = 2
+
+    same_width_lists = []
+    current_width_list = []
+    for key in sorted(width_dict):
+        if not current_width_list:
+            current_width_list = [key]
+        elif (key == current_width_list[-1] + 1
+              and width_dict[key] == width_dict[current_width_list[0]]):
+            current_width_list.append(key)
+        else:
+            same_width_lists.append(current_width_list)
+            current_width_list = [key]
+    if current_width_list:
+        same_width_lists.append(current_width_list)
+
+    for same_width_list in same_width_lists:
+        if len(same_width_list) == 1:
+            outfile.write('{:s}\t{:d}\n'.format(
+                unicode_utils.ucs_symbol(same_width_list[0]),
+                width_dict[same_width_list[0]]))
+        else:
+            outfile.write('{:s}...{:s}\t{:d}\n'.format(
+                unicode_utils.ucs_symbol(same_width_list[0]),
+                unicode_utils.ucs_symbol(same_width_list[-1]),
+                width_dict[same_width_list[0]]))
+
+if __name__ == "__main__":
+    PARSER = argparse.ArgumentParser(
+        description='''
+        Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt, and PropList.txt.
+        ''')
+    PARSER.add_argument(
+        '-u', '--unicode_data_file',
+        nargs='?',
+        type=str,
+        default='UnicodeData.txt',
+        help=('The UnicodeData.txt file to read, '
+              + 'default: %(default)s'))
+    PARSER.add_argument(
+        '-e', '--east_asian_with_file',
+        nargs='?',
+        type=str,
+        default='EastAsianWidth.txt',
+        help=('The EastAsianWidth.txt file to read, '
+              + 'default: %(default)s'))
+    PARSER.add_argument(
+        '-p', '--prop_list_file',
+        nargs='?',
+        type=str,
+        default='PropList.txt',
+        help=('The PropList.txt file to read, '
+              + 'default: %(default)s'))
+    PARSER.add_argument(
+        '--unicode_version',
+        nargs='?',
+        required=True,
+        type=str,
+        help='The Unicode version of the input files used.')
+    ARGS = PARSER.parse_args()
+
+    with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
+        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
+    with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
+        EAST_ASIAN_WIDTH_LINES = []
+        for LINE in EAST_ASIAN_WIDTH_FILE:
+            # If characters from EastAsianWidth.txt which are
+            # from reserved ranges (i.e. not yet assigned code points)
+            # are added to the WIDTH section of the UTF-8 file, then
+            # “make check” produces “Unknown Character” errors for
+            # these code points because such unassigned code points
+            # are not in the CHARMAP section of the UTF-8 file.
+            #
+            # Therefore, we skip all reserved code points when reading
+            # the EastAsianWidth.txt file.
+            if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
+                continue
+            if re.match(r'^[^;]*;[WF]', LINE):
+                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
+    with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
+        PROP_LIST_LINES = []
+        for LINE in PROP_LIST_FILE:
+            if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
+                PROP_LIST_LINES.append(LINE.strip())
+    with open('UTF-8', mode='w') as OUTFILE:
+        # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
+        write_header_charmap(OUTFILE)
+        process_charmap(UNICODE_DATA_LINES, OUTFILE)
+        OUTFILE.write("END CHARMAP\n\n")
+        # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
+        write_header_width(OUTFILE, ARGS.unicode_version)
+        process_width(OUTFILE,
+                      UNICODE_DATA_LINES,
+                      EAST_ASIAN_WIDTH_LINES,
+                      PROP_LIST_LINES)
+        OUTFILE.write("END WIDTH\n")
diff --git a/contrib/unicode/gen_wcwidth.py b/contrib/unicode/gen_wcwidth.py
new file mode 100755
index 00000000000..02b28bcedcf
--- /dev/null
+++ b/contrib/unicode/gen_wcwidth.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+#
+# Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 3, or (at your option) any later
+# version.
+#
+# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3. If not see
+# <http://www.gnu.org/licenses/>.
+
+import sys
+import os
+
+if len(sys.argv) != 2:
+    print("usage: %s <unicode version>" % sys.argv[0], file=sys.stderr)
+    sys.exit(1)
+unicode_version = sys.argv[1]
+
+# Parse a codepoint in the format output by glibc tools.
+def parse_ucn(s): + if not (s.startswith("")): + raise ValueError + return int(s[2:-1], base=16) + +# Process a line of width output from utf_gen.py and update global array. +widths = [1] * (1 + 0x10FFFF) +def process_width(line): + # Example lines: + # 0 + # ... 0 + + s = line.split() + width = int(s[1]) + r = s[0].split("...") + if len(r) == 1: + begin = parse_ucn(r[0]) + end = begin + 1 + elif len(r) == 2: + begin = parse_ucn(r[0]) + end = parse_ucn(r[1]) + 1 + else: + raise ValueError + widths[begin:end] = [width] * (end - begin) + +# To keep things simple, we use glibc utf8_gen.py as-is. It only outputs to a +# file named UTF-8, which is not configurable. Then we parse this into the form +# we want it. +os.system("from_glibc/utf8_gen.py --unicode_version %s" % unicode_version) +processing = False +for line in open("UTF-8", "r"): + if processing: + if line == "END WIDTH\n": + processing = False + else: + try: + process_width(line) + except (ValueError, IndexError): + print(e, "warning: ignored unexpected line: %s" % line, + file=sys.stderr, end="") + elif line == "WIDTH\n": + processing = True + +# All bytes < 256 we treat as width 1. +widths[0:255] = [1] * 255 + +# Condense the list to contiguous ranges. +cur_range = [-1, 1] +all_ranges = [] +for i, width in enumerate(widths): + if width == cur_range[1]: + cur_range[0] = i + else: + all_ranges.append(cur_range) + cur_range = [i, width] + +# Output the arrays for generated_cpp_wcwidth.h +print("/* Generated by contrib/unicode/gen_wcwidth.py,", + "with the help of glibc's") +print(" utf8_gen.py, using version %s" % unicode_version, + "of the Unicode standard. 
*/") +print("\nstatic const cppchar_t wcwidth_range_ends[] = {", end="") +for i, r in enumerate(all_ranges): + if i % 8: + print(" ", end="") + else: + print("\n ", end="") + print("0x%x," % (r[0]), end="") +print("\n};\n") +print("static const unsigned char wcwidth_widths[] = {", end="") +for i, r in enumerate(all_ranges): + if i % 24: + print(" ", end="") + else: + print("\n ", end="") + print("%d," % r[1], end="") +print("\n};") diff --git a/gcc/diagnostic-show-locus.c b/gcc/diagnostic-show-locus.c index 4d563dda8f4..7a5bd36d962 100644 --- a/gcc/diagnostic-show-locus.c +++ b/gcc/diagnostic-show-locus.c @@ -30,6 +30,7 @@ along with GCC; see the file COPYING3. If not see #include "gcc-rich-location.h" #include "selftest.h" #include "selftest-diagnostic.h" +#include "cpplib.h" #ifdef HAVE_TERMIOS_H # include @@ -112,7 +113,29 @@ class colorizer const char *m_stop_color; }; -/* A point within a layout_range; similar to an expanded_location, +/* In order to handle multibyte sources properly, all of this logic needs to be + aware of the distinction between the number of bytes and the number of + display columns occupied by a character. One or the other is more useful + depending on the context. For instance, in order to output the caret at the + correct location, we need to count display columns; in order to colorize a + source line, we need to count the bytes. All locations are provided to us + as byte counts. We augment these with the display column so that it can be + used when need. This is not the most efficient way to do things since it + requires looping over the whole line each time, but it should be fine for + the purpose of outputting diagnostics. 
*/ + +class exploc_with_display_col : public expanded_location +{ + public: + exploc_with_display_col (const expanded_location &exploc) + : expanded_location (exploc), + m_display_col (location_compute_display_column (exploc)) {} + + int m_display_col; +}; + + +/* A point within a layout_range; similar to an exploc_with_display_col, but after filtering on file. */ class layout_point @@ -120,10 +143,17 @@ class layout_point public: layout_point (const expanded_location &exploc) : m_line (exploc.line), - m_column (exploc.column) {} + m_column (exploc.column), + m_display_col (location_compute_display_column (exploc)) {} + + int get_col (bool use_display) const + { + return use_display ? m_display_col : m_column; + } linenum_type m_line; int m_column; + int m_display_col; }; /* A class for use by "class layout" below: a filtered location_range. */ @@ -138,7 +168,7 @@ class layout_range unsigned original_idx, const range_label *label); - bool contains_point (linenum_type row, int column) const; + bool contains_point (linenum_type row, int column, bool use_display) const; bool intersects_line_p (linenum_type row) const; layout_point m_start; @@ -157,6 +187,17 @@ struct line_bounds { int m_first_non_ws; int m_last_non_ws; + + void convert_to_display_cols (char_span line) + { + m_first_non_ws = cpp_byte_column_to_display_column (line.get_buffer (), + line.length (), + m_first_non_ws); + + m_last_non_ws = cpp_byte_column_to_display_column (line.get_buffer (), + line.length (), + m_last_non_ws); + } }; /* A range of contiguous source lines within a layout (e.g. "lines 5-10" @@ -284,6 +325,7 @@ class layout get_state_at_point (/* Inputs. */ linenum_type row, int column, int first_non_ws, int last_non_ws, + bool use_display, /* Outputs. 
*/ point_state *out_state); @@ -298,7 +340,7 @@ class layout diagnostic_context *m_context; pretty_printer *m_pp; location_t m_primary_loc; - expanded_location m_exploc; + exploc_with_display_col m_exploc; colorizer m_colorizer; bool m_colorize_source_p; bool m_show_labels_p; @@ -472,10 +514,15 @@ layout_range::layout_range (const expanded_location *start_exploc, - 'w' indicates a point within the range - 'F' indicates the finish of the range (which is within it). - - 'a' indicates a subsequent point *after* the range. */ + - 'a' indicates a subsequent point *after* the range. + + USE_DISPLAY controls whether we check the byte column or + the display column; one or the other is more convenient + depending on the context. */ bool -layout_range::contains_point (linenum_type row, int column) const +layout_range::contains_point (linenum_type row, int column, + bool use_display) const { gcc_assert (m_start.m_line <= m_finish.m_line); /* ...but the equivalent isn't true for the columns; @@ -491,7 +538,7 @@ layout_range::contains_point (linenum_type row, int column) const /* On same line as start of range (corresponding to line 02 in example A and line 03 in example B). */ { - if (column < m_start.m_column) + if (column < m_start.get_col (use_display)) /* Points on the starting line of the range, but before the column in which it begins. */ return false; @@ -505,7 +552,7 @@ layout_range::contains_point (linenum_type row, int column) const { /* This is a single-line range. */ gcc_assert (row == m_finish.m_line); - return column <= m_finish.m_column; + return column <= m_finish.get_col (use_display); } } @@ -530,7 +577,7 @@ layout_range::contains_point (linenum_type row, int column) const gcc_assert (row == m_finish.m_line); - return column <= m_finish.m_column; + return column <= m_finish.get_col (use_display); } /* Does this layout_range contain any part of line ROW? */ @@ -574,20 +621,23 @@ test_layout_range_for_single_point () /* Tests for layout_range::contains_point. 
*/ - /* Before the line. */ - ASSERT_FALSE (point.contains_point (6, 1)); + for (int use_display = 0; use_display <= 1; ++use_display) + { + /* Before the line. */ + ASSERT_FALSE (point.contains_point (6, 1, use_display)); - /* On the line, but before start. */ - ASSERT_FALSE (point.contains_point (7, 9)); + /* On the line, but before start. */ + ASSERT_FALSE (point.contains_point (7, 9, use_display)); - /* At the point. */ - ASSERT_TRUE (point.contains_point (7, 10)); + /* At the point. */ + ASSERT_TRUE (point.contains_point (7, 10, use_display)); - /* On the line, after the point. */ - ASSERT_FALSE (point.contains_point (7, 11)); + /* On the line, after the point. */ + ASSERT_FALSE (point.contains_point (7, 11, use_display)); - /* After the line. */ - ASSERT_FALSE (point.contains_point (8, 1)); + /* After the line. */ + ASSERT_FALSE (point.contains_point (8, 1, use_display)); + } /* Tests for layout_range::intersects_line_p. */ ASSERT_FALSE (point.intersects_line_p (6)); @@ -605,26 +655,29 @@ test_layout_range_for_single_line () /* Tests for layout_range::contains_point. */ - /* Before the line. */ - ASSERT_FALSE (example_a.contains_point (1, 1)); + for (int use_display = 0; use_display <= 1; ++use_display) + { + /* Before the line. */ + ASSERT_FALSE (example_a.contains_point (1, 1, use_display)); - /* On the line, but before start. */ - ASSERT_FALSE (example_a.contains_point (2, 21)); + /* On the line, but before start. */ + ASSERT_FALSE (example_a.contains_point (2, 21, use_display)); - /* On the line, at the start. */ - ASSERT_TRUE (example_a.contains_point (2, 22)); + /* On the line, at the start. */ + ASSERT_TRUE (example_a.contains_point (2, 22, use_display)); - /* On the line, within the range. */ - ASSERT_TRUE (example_a.contains_point (2, 23)); + /* On the line, within the range. */ + ASSERT_TRUE (example_a.contains_point (2, 23, use_display)); - /* On the line, at the end. 
*/ - ASSERT_TRUE (example_a.contains_point (2, 38)); + /* On the line, at the end. */ + ASSERT_TRUE (example_a.contains_point (2, 38, use_display)); - /* On the line, after the end. */ - ASSERT_FALSE (example_a.contains_point (2, 39)); + /* On the line, after the end. */ + ASSERT_FALSE (example_a.contains_point (2, 39, use_display)); - /* After the line. */ - ASSERT_FALSE (example_a.contains_point (2, 39)); + /* After the line. */ + ASSERT_FALSE (example_a.contains_point (2, 39, use_display)); + } /* Tests for layout_range::intersects_line_p. */ ASSERT_FALSE (example_a.intersects_line_p (1)); @@ -642,40 +695,43 @@ test_layout_range_for_multiple_lines () /* Tests for layout_range::contains_point. */ - /* Before first line. */ - ASSERT_FALSE (example_b.contains_point (1, 1)); + for (int use_display = 0; use_display <= 1; ++use_display) + { + /* Before first line. */ + ASSERT_FALSE (example_b.contains_point (1, 1, use_display)); - /* On the first line, but before start. */ - ASSERT_FALSE (example_b.contains_point (3, 13)); + /* On the first line, but before start. */ + ASSERT_FALSE (example_b.contains_point (3, 13, use_display)); - /* At the start. */ - ASSERT_TRUE (example_b.contains_point (3, 14)); + /* At the start. */ + ASSERT_TRUE (example_b.contains_point (3, 14, use_display)); - /* On the first line, within the range. */ - ASSERT_TRUE (example_b.contains_point (3, 15)); + /* On the first line, within the range. */ + ASSERT_TRUE (example_b.contains_point (3, 15, use_display)); - /* On an interior line. - The column number should not matter; try various boundary - values. */ - ASSERT_TRUE (example_b.contains_point (4, 1)); - ASSERT_TRUE (example_b.contains_point (4, 7)); - ASSERT_TRUE (example_b.contains_point (4, 8)); - ASSERT_TRUE (example_b.contains_point (4, 9)); - ASSERT_TRUE (example_b.contains_point (4, 13)); - ASSERT_TRUE (example_b.contains_point (4, 14)); - ASSERT_TRUE (example_b.contains_point (4, 15)); + /* On an interior line. 
+ The column number should not matter; try various boundary + values. */ + ASSERT_TRUE (example_b.contains_point (4, 1, use_display)); + ASSERT_TRUE (example_b.contains_point (4, 7, use_display)); + ASSERT_TRUE (example_b.contains_point (4, 8, use_display)); + ASSERT_TRUE (example_b.contains_point (4, 9, use_display)); + ASSERT_TRUE (example_b.contains_point (4, 13, use_display)); + ASSERT_TRUE (example_b.contains_point (4, 14, use_display)); + ASSERT_TRUE (example_b.contains_point (4, 15, use_display)); - /* On the final line, before the end. */ - ASSERT_TRUE (example_b.contains_point (5, 7)); + /* On the final line, before the end. */ + ASSERT_TRUE (example_b.contains_point (5, 7, use_display)); - /* On the final line, at the end. */ - ASSERT_TRUE (example_b.contains_point (5, 8)); + /* On the final line, at the end. */ + ASSERT_TRUE (example_b.contains_point (5, 8, use_display)); - /* On the final line, after the end. */ - ASSERT_FALSE (example_b.contains_point (5, 9)); + /* On the final line, after the end. */ + ASSERT_FALSE (example_b.contains_point (5, 9, use_display)); - /* After the line. */ - ASSERT_FALSE (example_b.contains_point (6, 1)); + /* After the line. */ + ASSERT_FALSE (example_b.contains_point (6, 1, use_display)); + } /* Tests for layout_range::intersects_line_p. */ ASSERT_FALSE (example_b.intersects_line_p (2)); @@ -687,8 +743,8 @@ test_layout_range_for_multiple_lines () #endif /* #if CHECKING_P */ -/* Given a source line LINE of length LINE_WIDTH, determine the width - without any trailing whitespace. */ +/* Given a source line LINE of length LINE_WIDTH bytes, determine the width + (in bytes, not display cols) without any trailing whitespace. */ static int get_line_width_without_trailing_whitespace (const char *line, int line_width) @@ -897,17 +953,35 @@ layout::layout (diagnostic_context * context, will be adjusted accordingly. 
*/ size_t max_width = m_context->caret_max_width; char_span line = location_get_source_line (m_exploc.file, m_exploc.line); - if (line && (size_t)m_exploc.column <= line.length ()) + if (line && max_width) { - size_t right_margin = CARET_LINE_MARGIN; - size_t column = m_exploc.column; - if (m_show_line_numbers_p) - column += m_linenum_width + 2; - right_margin = MIN (line.length () - column, right_margin); - right_margin = max_width - right_margin; - if (line.length () >= max_width && column > right_margin) - m_x_offset = column - right_margin; - gcc_assert (m_x_offset >= 0); + size_t column = m_exploc.m_display_col; + int line_width + = get_line_width_without_trailing_whitespace (line.get_buffer (), + line.length ()); + size_t eol = cpp_display_width (line.get_buffer (), line_width); + const size_t eol_before_linenum = eol; + + if (column <= eol) + { + if (m_show_line_numbers_p) + { + column += m_linenum_width + 2; + eol += m_linenum_width + 2; + } + size_t right_margin = CARET_LINE_MARGIN; + right_margin = MIN (eol - column, right_margin); + right_margin = max_width - right_margin; + /* Note: if right_margin > max_width, we end up failing this next + check due to wrapping, and we don't offset anything. Otherwise we + would conclude we can't output the line at all. */ + if (eol >= max_width && column > right_margin) + { + m_x_offset = column - right_margin; + m_x_offset = MIN (m_x_offset, (int) eol_before_linenum - 1); + } + gcc_assert (m_x_offset >= 0); + } } if (context->show_ruler_p) @@ -1252,7 +1326,9 @@ layout::calculate_line_spans () /* Print line ROW of source code, potentially colorized at any ranges, and populate *LBOUNDS_OUT. LINE is the source line (not necessarily 0-terminated) and LINE_WIDTH - is its width. */ + is its width. This function deals only with byte offsets, not display + columns; m_x_offset must be converted from display to byte units. In + particular, LINE_WIDTH and LBOUNDS_OUT are in bytes. 
*/ void layout::print_source_line (linenum_type row, const char *line, int line_width, @@ -1264,7 +1340,10 @@ layout::print_source_line (linenum_type row, const char *line, int line_width, whitespace. */ line_width = get_line_width_without_trailing_whitespace (line, line_width); - line += m_x_offset; + + const int x_offset_bytes + = cpp_display_column_to_byte_column (line, line_width, m_x_offset); + line += x_offset_bytes; if (m_show_line_numbers_p) { @@ -1278,7 +1357,7 @@ layout::print_source_line (linenum_type row, const char *line, int line_width, int first_non_ws = INT_MAX; int last_non_ws = 0; int column; - for (column = 1 + m_x_offset; column <= line_width; column++) + for (column = 1 + x_offset_bytes; column <= line_width; column++) { /* Assuming colorization is enabled for the caret and underline characters, we may also colorize the associated characters @@ -1298,6 +1377,8 @@ layout::print_source_line (linenum_type row, const char *line, int line_width, point_state state; in_range_p = get_state_at_point (row, column, 0, INT_MAX, + false, /* Using bytes, not display + columns, here. */ &state); if (in_range_p) m_colorizer.set_range (state.range_idx); @@ -1360,12 +1441,13 @@ layout::start_annotation_line (char margin_char) const } /* Print a line consisting of the caret/underlines for the given - source line. */ + source line. This function works with display columns, rather than byte + counts; in particular, LBOUNDS should be in display column units. */ void layout::print_annotation_line (linenum_type row, const line_bounds lbounds) { - int x_bound = get_x_bound_for_row (row, m_exploc.column, + int x_bound = get_x_bound_for_row (row, m_exploc.m_display_col, lbounds.m_last_non_ws); start_annotation_line (); @@ -1378,6 +1460,7 @@ layout::print_annotation_line (linenum_type row, const line_bounds lbounds) in_range_p = get_state_at_point (row, column, lbounds.m_first_non_ws, lbounds.m_last_non_ws, + true, /* Using display units. 
*/ &state); if (in_range_p) { @@ -1415,9 +1498,11 @@ class line_label public: line_label (int state_idx, int column, label_text text) : m_state_idx (state_idx), m_column (column), - m_text (text), m_length (strlen (text.m_buffer)), - m_label_line (0) - {} + m_text (text), m_label_line (0) + { + const int bytes = strlen (text.m_buffer); + m_length = cpp_display_width (text.m_buffer, bytes); + } /* Sorting is primarily by column, then by state index. */ static int comparator (const void *p1, const void *p2) @@ -1459,7 +1544,7 @@ layout::print_any_labels (linenum_type row) /* Reject labels that aren't fully visible due to clipping by m_x_offset. */ - if (range->m_caret.m_column <= m_x_offset) + if (range->m_caret.m_display_col <= m_x_offset) continue; label_text text; @@ -1471,7 +1556,7 @@ layout::print_any_labels (linenum_type row) if (text.m_buffer == NULL) continue; - labels.safe_push (line_label (i, range->m_caret.m_column, text)); + labels.safe_push (line_label (i, range->m_caret.m_display_col, text)); } } @@ -1624,7 +1709,7 @@ layout::print_leading_fixits (linenum_type row) /* Subroutine of layout::print_trailing_fixits. Determine if the annotation line printed for LINE contained - the exact range from START_COLUMN to FINISH_COLUMN. */ + the exact range from START_COLUMN to FINISH_COLUMN (in display units). */ bool layout::annotation_line_showed_range_p (linenum_type line, int start_column, @@ -1634,9 +1719,9 @@ layout::annotation_line_showed_range_p (linenum_type line, int start_column, int i; FOR_EACH_VEC_ELT (m_layout_ranges, i, range) if (range->m_start.m_line == line - && range->m_start.m_column == start_column + && range->m_start.m_display_col == start_column && range->m_finish.m_line == line - && range->m_finish.m_column == finish_column) + && range->m_finish.m_display_col == finish_column) return true; return false; } @@ -1723,7 +1808,7 @@ layout::annotation_line_showed_range_p (linenum_type line, int start_column, and is thus printed as desired. 
*/ -/* A range of columns within a line. */ +/* A range of (byte or display) columns within a line. */ class column_range { @@ -1743,32 +1828,51 @@ public: int finish; }; -/* Get the range of columns that HINT would affect. */ - +/* Get the range of bytes or display columns that HINT would affect. */ static column_range -get_affected_columns (const fixit_hint *hint) +get_affected_range (const fixit_hint *hint, bool use_display) { - int start_column = LOCATION_COLUMN (hint->get_start_loc ()); - int finish_column = LOCATION_COLUMN (hint->get_next_loc ()) - 1; + expanded_location exploc_start = expand_location (hint->get_start_loc ()); + expanded_location exploc_finish = expand_location (hint->get_next_loc ()); + --exploc_finish.column; + int start_column; + int finish_column; + if (use_display) + { + start_column = location_compute_display_column (exploc_start); + if (hint->insertion_p ()) + finish_column = start_column - 1; + else + finish_column = location_compute_display_column (exploc_finish); + } + else + { + start_column = exploc_start.column; + finish_column = exploc_finish.column; + } return column_range (start_column, finish_column); } -/* Get the range of columns that would be printed for HINT. */ +/* Get the range of display columns that would be printed for HINT. 
*/ static column_range get_printed_columns (const fixit_hint *hint) { - int start_column = LOCATION_COLUMN (hint->get_start_loc ()); - int final_hint_column = start_column + hint->get_length () - 1; + expanded_location exploc = expand_location (hint->get_start_loc ()); + int start_column = location_compute_display_column (exploc); + int hint_width = cpp_display_width (hint->get_string (), + hint->get_length ()); + int final_hint_column = start_column + hint_width - 1; if (hint->insertion_p ()) { return column_range (start_column, final_hint_column); } else { - int finish_column = LOCATION_COLUMN (hint->get_next_loc ()) - 1; - + exploc = expand_location (hint->get_next_loc ()); + --exploc.column; + int finish_column = location_compute_display_column (exploc); return column_range (start_column, MAX (finish_column, final_hint_column)); } @@ -1782,27 +1886,35 @@ get_printed_columns (const fixit_hint *hint) class correction { public: - correction (column_range affected_columns, + correction (column_range affected_bytes, + column_range affected_columns, column_range printed_columns, const char *new_text, size_t new_text_len) - : m_affected_columns (affected_columns), + : m_affected_bytes (affected_bytes), + m_affected_columns (affected_columns), m_printed_columns (printed_columns), m_text (xstrdup (new_text)), - m_len (new_text_len), + m_bytes (new_text_len), m_alloc_sz (new_text_len + 1) { + compute_display_cols (); } ~correction () { free (m_text); } bool insertion_p () const { - return m_affected_columns.start == m_affected_columns.finish + 1; + return m_affected_bytes.start == m_affected_bytes.finish + 1; } void ensure_capacity (size_t len); void ensure_terminated (); + void compute_display_cols () + { + m_display_cols = cpp_display_width (m_text, m_bytes); + } + void overwrite (int dst_offset, const char_span &src_span) { gcc_assert (dst_offset >= 0); @@ -1815,6 +1927,7 @@ public: is to be inserted, and finish is offset by the length of the replacement. 
If replace, then the range of columns affected. */ + column_range m_affected_bytes; column_range m_affected_columns; /* If insert, then start: the column before which the text @@ -1825,7 +1938,8 @@ public: /* The text to be inserted/used as replacement. */ char *m_text; - size_t m_len; + size_t m_bytes; + int m_display_cols; size_t m_alloc_sz; }; @@ -1850,8 +1964,8 @@ void correction::ensure_terminated () { /* 0-terminate the buffer. */ - gcc_assert (m_len < m_alloc_sz); - m_text[m_len] = '\0'; + gcc_assert (m_bytes < m_alloc_sz); + m_text[m_bytes] = '\0'; } /* A list of corrections affecting a particular line. @@ -1913,7 +2027,8 @@ source_line::source_line (const char *filename, int line) void line_corrections::add_hint (const fixit_hint *hint) { - column_range affected_columns = get_affected_columns (hint); + column_range affected_bytes = get_affected_range (hint, false); + column_range affected_columns = get_affected_range (hint, true); column_range printed_columns = get_printed_columns (hint); /* Potentially consolidate. */ @@ -1924,8 +2039,8 @@ line_corrections::add_hint (const fixit_hint *hint) /* The following consolidation code assumes that the fix-it hints have been sorted by start (done within layout's ctor). */ - gcc_assert (affected_columns.start - >= last_correction->m_affected_columns.start); + gcc_assert (affected_bytes.start + >= last_correction->m_affected_bytes.start); gcc_assert (printed_columns.start >= last_correction->m_printed_columns.start); @@ -1937,8 +2052,8 @@ line_corrections::add_hint (const fixit_hint *hint) Attempt to inject a "replace" correction from immediately after the end of the last hint to immediately before the start of the next hint. */ - column_range between (last_correction->m_affected_columns.finish + 1, - printed_columns.start - 1); + column_range between (last_correction->m_affected_bytes.finish + 1, + affected_bytes.start - 1); /* Try to read the source. 
*/ source_line line (m_filename, m_row); @@ -1947,7 +2062,7 @@ line_corrections::add_hint (const fixit_hint *hint) /* Consolidate into the last correction: add a no-op "replace" of the "between" text, and add the text from the new hint. */ - int old_len = last_correction->m_len; + int old_len = last_correction->m_bytes; gcc_assert (old_len >= 0); int between_len = between.finish + 1 - between.start; gcc_assert (between_len >= 0); @@ -1961,19 +2076,24 @@ line_corrections::add_hint (const fixit_hint *hint) last_correction->overwrite (old_len + between_len, char_span (hint->get_string (), hint->get_length ())); - last_correction->m_len = new_len; + last_correction->m_bytes = new_len; last_correction->ensure_terminated (); + last_correction->m_affected_bytes.finish + = affected_bytes.finish; last_correction->m_affected_columns.finish = affected_columns.finish; + int prev_display_cols = last_correction->m_display_cols; + last_correction->compute_display_cols (); last_correction->m_printed_columns.finish - += between_len + hint->get_length (); + += last_correction->m_display_cols - prev_display_cols; return; } } } /* If no consolidation happened, add a new correction instance. 
*/ - m_corrections.safe_push (new correction (affected_columns, + m_corrections.safe_push (new correction (affected_bytes, + affected_columns, printed_columns, hint->get_string (), hint->get_length ())); @@ -2022,7 +2142,7 @@ layout::print_trailing_fixits (linenum_type row) m_colorizer.set_fixit_insert (); pp_string (m_pp, c->m_text); m_colorizer.set_normal_text (); - column += c->m_len; + column += c->m_display_cols; } else { @@ -2034,7 +2154,7 @@ layout::print_trailing_fixits (linenum_type row) int finish_column = c->m_affected_columns.finish; if (!annotation_line_showed_range_p (row, start_column, finish_column) - || c->m_len == 0) + || c->m_bytes == 0) { move_to_column (&column, start_column, true); m_colorizer.set_fixit_delete (); @@ -2045,13 +2165,13 @@ layout::print_trailing_fixits (linenum_type row) /* Print the replacement text. REPLACE also covers removals, so only do this extra work (potentially starting a new line) if we have actual replacement text. */ - if (c->m_len > 0) + if (c->m_bytes > 0) { move_to_column (&column, start_column, true); m_colorizer.set_fixit_insert (); pp_string (m_pp, c->m_text); m_colorizer.set_normal_text (); - column += c->m_len; + column += c->m_display_cols; } } } @@ -2072,12 +2192,14 @@ layout::print_newline () /* Return true if (ROW/COLUMN) is within a range of the layout. If it returns true, OUT_STATE is written to, with the range index, and whether we should draw the caret at - (ROW/COLUMN) (as opposed to an underline). */ + (ROW/COLUMN) (as opposed to an underline). USE_DISPLAY controls + whether all inputs and outputs are in bytes or display column units. */ bool layout::get_state_at_point (/* Inputs. */ linenum_type row, int column, int first_non_ws, int last_non_ws, + bool use_display, /* Outputs. */ point_state *out_state) { @@ -2090,7 +2212,7 @@ layout::get_state_at_point (/* Inputs. */ source colorization. 
*/ continue; - if (range->contains_point (row, column)) + if (range->contains_point (row, column, use_display)) { out_state->range_idx = i; @@ -2098,7 +2220,7 @@ layout::get_state_at_point (/* Inputs. */ out_state->draw_caret_p = false; if (range->m_range_display_kind == SHOW_RANGE_WITH_CARET && row == range->m_caret.m_line - && column == range->m_caret.m_column) + && column == range->m_caret.get_col (use_display)) out_state->draw_caret_p = true; /* Within a multiline range, don't display any underline @@ -2118,11 +2240,11 @@ layout::get_state_at_point (/* Inputs. */ /* Helper function for use by layout::print_line when printing the annotation line under the source line. - Get the column beyond the rightmost one that could contain a caret or - range marker, given that we stop rendering at trailing whitespace. + Get the display column beyond the rightmost one that could contain a caret + or range marker, given that we stop rendering at trailing whitespace. ROW is the source line within the given file. - CARET_COLUMN is the column of range 0's caret. - LAST_NON_WS_COLUMN is the last column containing a non-whitespace + CARET_COLUMN is the display column of range 0's caret. + LAST_NON_WS_COLUMN is the last display column containing a non-whitespace character of source (as determined when printing the source line). */ int @@ -2141,8 +2263,8 @@ layout::get_x_bound_for_row (linenum_type row, int caret_column, { /* On the final line within a range; ensure that we render up to the end of the range. 
*/ - if (result <= range->m_finish.m_column) - result = range->m_finish.m_column + 1; + if (result <= range->m_finish.m_display_col) + result = range->m_finish.m_display_col + 1; } else if (row < range->m_finish.m_line) { @@ -2233,7 +2355,11 @@ layout::print_line (linenum_type row) print_leading_fixits (row); print_source_line (row, line.get_buffer (), line.length (), &lbounds); if (should_print_annotation_line_p (row)) - print_annotation_line (row, lbounds); + { + if (lbounds.m_first_non_ws != INT_MAX) + lbounds.convert_to_display_cols (line); + print_annotation_line (row, lbounds); + } if (m_show_labels_p) print_any_labels (row); print_trailing_fixits (row); @@ -2846,6 +2972,560 @@ test_diagnostic_show_locus_one_liner (const line_table_case &case_) test_one_liner_labels (); } +/* Version of all one-liner tests exercising multibyte awareness. For + simplicity we stick to using two multibyte characters in the test, U+1F602 + == "\xf0\x9f\x98\x82", which uses 4 bytes and 2 display columns, and U+03C0 + == "\xcf\x80", which uses 2 bytes and 1 display column. Note: all of the + below asserts would be easier to read if we used UTF-8 directly in the + string constants, but it seems better not to demand the host compiler + support this, when it isn't otherwise necessary. Instead, whenever an + extended character appears in a string, we put a line break after it so that + all succeeding characters can appear visually at the correct display column. + + All of these work on the following 1-line source file: + + .0000000001111111111222222 display + .1234567890123456789012345 columns + "SS_foo = P_bar.SS_fieldP;\n" + .0000000111111111222222223 byte + .1356789012456789134567891 columns + + which is set up by test_diagnostic_show_locus_one_liner and calls + them. Here SS represents the two display columns for the U+1F602 emoji and + P represents the one display column for the U+03C0 pi symbol. */ + +/* Just a caret. 
*/ + +static void +test_one_liner_simple_caret_utf8 () +{ + test_diagnostic_context dc; + location_t caret = linemap_position_for_column (line_table, 18); + rich_location richloc (line_table, caret); + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^\n", + pp_formatted_text (dc.printer)); +} + +/* Caret and range. */ +static void +test_one_liner_caret_and_range_utf8 () +{ + test_diagnostic_context dc; + location_t caret = linemap_position_for_column (line_table, 18); + location_t start = linemap_position_for_column (line_table, 12); + location_t finish = linemap_position_for_column (line_table, 30); + location_t loc = make_location (caret, start, finish); + rich_location richloc (line_table, loc); + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ~~~~~^~~~~~~~~~\n", + pp_formatted_text (dc.printer)); +} + +/* Multiple ranges and carets. 
*/ + +static void +test_one_liner_multiple_carets_and_ranges_utf8 () +{ + test_diagnostic_context dc; + location_t foo + = make_location (linemap_position_for_column (line_table, 7), + linemap_position_for_column (line_table, 1), + linemap_position_for_column (line_table, 8)); + dc.caret_chars[0] = 'A'; + + location_t bar + = make_location (linemap_position_for_column (line_table, 16), + linemap_position_for_column (line_table, 12), + linemap_position_for_column (line_table, 17)); + dc.caret_chars[1] = 'B'; + + location_t field + = make_location (linemap_position_for_column (line_table, 26), + linemap_position_for_column (line_table, 19), + linemap_position_for_column (line_table, 30)); + dc.caret_chars[2] = 'C'; + rich_location richloc (line_table, foo); + richloc.add_range (bar, SHOW_RANGE_WITH_CARET); + richloc.add_range (field, SHOW_RANGE_WITH_CARET); + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ~~~~A~ ~~~B~ ~~~~~C~~~\n", + pp_formatted_text (dc.printer)); +} + +/* Insertion fix-it hint: adding an "&" to the front of "P_bar.field". */ + +static void +test_one_liner_fixit_insert_before_utf8 () +{ + test_diagnostic_context dc; + location_t caret = linemap_position_for_column (line_table, 12); + rich_location richloc (line_table, caret); + richloc.add_fixit_insert_before ("&"); + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^\n" + " &\n", + pp_formatted_text (dc.printer)); +} + +/* Insertion fix-it hint: adding a "[0]" after "SS_foo". 
*/ + +static void +test_one_liner_fixit_insert_after_utf8 () +{ + test_diagnostic_context dc; + location_t start = linemap_position_for_column (line_table, 1); + location_t finish = linemap_position_for_column (line_table, 8); + location_t foo = make_location (start, start, finish); + rich_location richloc (line_table, foo); + richloc.add_fixit_insert_after ("[0]"); + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^~~~~~\n" + " [0]\n", + pp_formatted_text (dc.printer)); +} + +/* Removal fix-it hint: removal of the ".SS_fieldP". */ + +static void +test_one_liner_fixit_remove_utf8 () +{ + test_diagnostic_context dc; + location_t start = linemap_position_for_column (line_table, 18); + location_t finish = linemap_position_for_column (line_table, 30); + location_t dot = make_location (start, start, finish); + rich_location richloc (line_table, dot); + richloc.add_fixit_remove (); + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^~~~~~~~~~\n" + " ----------\n", + pp_formatted_text (dc.printer)); +} + +/* Replace fix-it hint: replacing "SS_fieldP" with "m_SSfieldP". 
*/ + +static void +test_one_liner_fixit_replace_utf8 () +{ + test_diagnostic_context dc; + location_t start = linemap_position_for_column (line_table, 19); + location_t finish = linemap_position_for_column (line_table, 30); + location_t field = make_location (start, start, finish); + rich_location richloc (line_table, field); + richloc.add_fixit_replace ("m_\xf0\x9f\x98\x82_field\xcf\x80"); + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^~~~~~~~~\n" + " m_\xf0\x9f\x98\x82" + "_field\xcf\x80\n", + pp_formatted_text (dc.printer)); +} + +/* Replace fix-it hint: replacing "SS_fieldP" with "m_SSfieldP", + but where the caret was elsewhere. */ + +static void +test_one_liner_fixit_replace_non_equal_range_utf8 () +{ + test_diagnostic_context dc; + location_t equals = linemap_position_for_column (line_table, 10); + location_t start = linemap_position_for_column (line_table, 19); + location_t finish = linemap_position_for_column (line_table, 30); + rich_location richloc (line_table, equals); + source_range range; + range.m_start = start; + range.m_finish = finish; + richloc.add_fixit_replace (range, "m_\xf0\x9f\x98\x82_field\xcf\x80"); + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + /* The replacement range is not indicated in the annotation line, so + it should be indicated via an additional underline. */ + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^\n" + " ---------\n" + " m_\xf0\x9f\x98\x82" + "_field\xcf\x80\n", + pp_formatted_text (dc.printer)); +} + +/* Replace fix-it hint: replacing "SS_fieldP" with "m_SSfieldP", + where the caret was elsewhere, but where a secondary range + exactly covers "field". 
*/ + +static void +test_one_liner_fixit_replace_equal_secondary_range_utf8 () +{ + test_diagnostic_context dc; + location_t equals = linemap_position_for_column (line_table, 10); + location_t start = linemap_position_for_column (line_table, 19); + location_t finish = linemap_position_for_column (line_table, 30); + rich_location richloc (line_table, equals); + location_t field = make_location (start, start, finish); + richloc.add_range (field); + richloc.add_fixit_replace (field, "m_\xf0\x9f\x98\x82_field\xcf\x80"); + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + /* The replacement range is indicated in the annotation line, + so it shouldn't be indicated via an additional underline. */ + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^ ~~~~~~~~~\n" + " m_\xf0\x9f\x98\x82" + "_field\xcf\x80\n", + pp_formatted_text (dc.printer)); +} + +/* Verify that we can use ad-hoc locations when adding fixits to a + rich_location. */ + +static void +test_one_liner_fixit_validation_adhoc_locations_utf8 () +{ + /* Generate a range that's too long to be packed, so must + be stored as an ad-hoc location (given the defaults + of 5 bits or 0 bits of packed range); 41 columns > 2**5. */ + const location_t c12 = linemap_position_for_column (line_table, 12); + const location_t c52 = linemap_position_for_column (line_table, 52); + const location_t loc = make_location (c12, c12, c52); + + if (c52 > LINE_MAP_MAX_LOCATION_WITH_COLS) + return; + + ASSERT_TRUE (IS_ADHOC_LOC (loc)); + + /* Insert. */ + { + rich_location richloc (line_table, loc); + richloc.add_fixit_insert_before (loc, "test"); + /* It should not have been discarded by the validator. 
*/ + ASSERT_EQ (1, richloc.get_num_fixit_hints ()); + + test_diagnostic_context dc; + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^~~~~~~~~~~~~~~~ \n" + " test\n", + pp_formatted_text (dc.printer)); + } + + /* Remove. */ + { + rich_location richloc (line_table, loc); + source_range range = source_range::from_locations (loc, c52); + richloc.add_fixit_remove (range); + /* It should not have been discarded by the validator. */ + ASSERT_EQ (1, richloc.get_num_fixit_hints ()); + + test_diagnostic_context dc; + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^~~~~~~~~~~~~~~~ \n" + " -------------------------------------\n", + pp_formatted_text (dc.printer)); + } + + /* Replace. */ + { + rich_location richloc (line_table, loc); + source_range range = source_range::from_locations (loc, c52); + richloc.add_fixit_replace (range, "test"); + /* It should not have been discarded by the validator. */ + ASSERT_EQ (1, richloc.get_num_fixit_hints ()); + + test_diagnostic_context dc; + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^~~~~~~~~~~~~~~~ \n" + " test\n", + pp_formatted_text (dc.printer)); + } +} + +/* Test of consolidating insertions at the same location. */ + +static void +test_one_liner_many_fixits_1_utf8 () +{ + test_diagnostic_context dc; + location_t equals = linemap_position_for_column (line_table, 10); + rich_location richloc (line_table, equals); + for (int i = 0; i < 19; i++) + richloc.add_fixit_insert_before (i & 1 ? 
"@" : "\xcf\x80"); + ASSERT_EQ (1, richloc.get_num_fixit_hints ()); + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^\n" + " \xcf\x80@\xcf\x80@\xcf\x80@\xcf\x80@\xcf\x80@" + "\xcf\x80@\xcf\x80@\xcf\x80@\xcf\x80@\xcf\x80\n", + pp_formatted_text (dc.printer)); +} + +/* Ensure that we can add an arbitrary number of fix-it hints to a + rich_location, even if they are not consolidated. */ + +static void +test_one_liner_many_fixits_2_utf8 () +{ + test_diagnostic_context dc; + location_t equals = linemap_position_for_column (line_table, 10); + rich_location richloc (line_table, equals); + const int nlocs = 19; + int locs[nlocs] = {1, 5, 7, 9, 11, 14, 16, 18, 23, 25, 27, 29, 32, + 34, 36, 38, 40, 42, 44}; + for (int i = 0; i != nlocs; ++i) + { + location_t loc = linemap_position_for_column (line_table, locs[i]); + richloc.add_fixit_insert_before (loc, i & 1 ? "@" : "\xcf\x80"); + } + + ASSERT_EQ (nlocs, richloc.get_num_fixit_hints ()); + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^\n" + " \xcf\x80 @ \xcf\x80 @ \xcf\x80 @ \xcf\x80 @ \xcf\x80 @" + " \xcf\x80 @ \xcf\x80 @ \xcf\x80 @ \xcf\x80 @ \xcf\x80\n", + pp_formatted_text (dc.printer)); +} + +/* Test of labeling the ranges within a rich_location. 
*/ + +static void +test_one_liner_labels_utf8 () +{ + location_t foo + = make_location (linemap_position_for_column (line_table, 1), + linemap_position_for_column (line_table, 1), + linemap_position_for_column (line_table, 8)); + location_t bar + = make_location (linemap_position_for_column (line_table, 12), + linemap_position_for_column (line_table, 12), + linemap_position_for_column (line_table, 17)); + location_t field + = make_location (linemap_position_for_column (line_table, 19), + linemap_position_for_column (line_table, 19), + linemap_position_for_column (line_table, 30)); + + /* Example where all the labels fit on one line. */ + { + text_range_label label0 + ("\xcf\x80\xcf\x80\xcf\x80\xcf\x80\xcf\x80\xcf\x80"); + text_range_label label1 + ("\xf0\x9f\x98\x82\xf0\x9f\x98\x82\xcf\x80"); + text_range_label label2 + ("\xf0\x9f\x98\x82\xcf\x80\xf0\x9f\x98\x82\xf0\x9f\x98\x82\xcf\x80" + "\xcf\x80"); + gcc_rich_location richloc (foo, &label0); + richloc.add_range (bar, SHOW_RANGE_WITHOUT_CARET, &label1); + richloc.add_range (field, SHOW_RANGE_WITHOUT_CARET, &label2); + + { + test_diagnostic_context dc; + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^~~~~~ ~~~~~ ~~~~~~~~~\n" + " | | |\n" + " \xcf\x80\xcf\x80\xcf\x80\xcf\x80\xcf\x80\xcf\x80" + " \xf0\x9f\x98\x82\xf0\x9f\x98\x82\xcf\x80" + " \xf0\x9f\x98\x82\xcf\x80\xf0\x9f\x98\x82" + "\xf0\x9f\x98\x82\xcf\x80\xcf\x80\n", + pp_formatted_text (dc.printer)); + } + + /* Verify that we can disable label-printing. */ + { + test_diagnostic_context dc; + dc.show_labels_p = false; + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^~~~~~ ~~~~~ ~~~~~~~~~\n", + pp_formatted_text (dc.printer)); + } + } + + /* Example where the labels need extra lines. 
*/ + { + text_range_label label0 ("label 0\xf0\x9f\x98\x82"); + text_range_label label1 ("label 1\xcf\x80"); + text_range_label label2 ("label 2\xcf\x80"); + gcc_rich_location richloc (foo, &label0); + richloc.add_range (bar, SHOW_RANGE_WITHOUT_CARET, &label1); + richloc.add_range (field, SHOW_RANGE_WITHOUT_CARET, &label2); + + test_diagnostic_context dc; + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^~~~~~ ~~~~~ ~~~~~~~~~\n" + " | | |\n" + " | | label 2\xcf\x80\n" + " | label 1\xcf\x80\n" + " label 0\xf0\x9f\x98\x82\n", + pp_formatted_text (dc.printer)); + } + + /* Example of boundary conditions: label 0 and 1 have just enough clearance, + but label 1 just touches label 2. */ + { + text_range_label label0 ("aaaaa\xf0\x9f\x98\x82\xcf\x80"); + text_range_label label1 ("bb\xf0\x9f\x98\x82\xf0\x9f\x98\x82"); + text_range_label label2 ("c"); + gcc_rich_location richloc (foo, &label0); + richloc.add_range (bar, SHOW_RANGE_WITHOUT_CARET, &label1); + richloc.add_range (field, SHOW_RANGE_WITHOUT_CARET, &label2); + + test_diagnostic_context dc; + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " \xf0\x9f\x98\x82" + "_foo = \xcf\x80" + "_bar.\xf0\x9f\x98\x82" + "_field\xcf\x80" + ";\n" + " ^~~~~~ ~~~~~ ~~~~~~~~~\n" + " | | |\n" + " | | c\n" + " aaaaa\xf0\x9f\x98\x82\xcf\x80" + " bb\xf0\x9f\x98\x82\xf0\x9f\x98\x82\n", + pp_formatted_text (dc.printer)); + } +} + +/* Run the various one-liner tests. */ + +static void +test_diagnostic_show_locus_one_liner_utf8 (const line_table_case &case_) +{ + /* Create a tempfile and write some text to it. */ + const char *content + /* Display columns. 
+ 0000000000000000000000011111111111111111111111111111112222222222222 + 1111111122222222345678900000000123456666666677777777890123444444445 */ + = "\xf0\x9f\x98\x82_foo = \xcf\x80_bar.\xf0\x9f\x98\x82_field\xcf\x80;\n"; + /* 0000000000000000000001111111111111111111222222222222222222222233333 + 1111222233334444567890122223333456789999000011112222345678999900001 + Byte columns. */ + temp_source_file tmp (SELFTEST_LOCATION, ".c", content); + line_table_test ltt (case_); + + linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1); + + location_t line_end = linemap_position_for_column (line_table, 31); + + /* Don't attempt to run the tests if column data might be unavailable. */ + if (line_end > LINE_MAP_MAX_LOCATION_WITH_COLS) + return; + + ASSERT_STREQ (tmp.get_filename (), LOCATION_FILE (line_end)); + ASSERT_EQ (1, LOCATION_LINE (line_end)); + ASSERT_EQ (31, LOCATION_COLUMN (line_end)); + + char_span lspan = location_get_source_line (tmp.get_filename (), 1); + ASSERT_EQ (25, cpp_display_width (lspan.get_buffer (), lspan.length ())); + ASSERT_EQ (25, location_compute_display_column (expand_location (line_end))); + + test_one_liner_simple_caret_utf8 (); + test_one_liner_caret_and_range_utf8 (); + test_one_liner_multiple_carets_and_ranges_utf8 (); + test_one_liner_fixit_insert_before_utf8 (); + test_one_liner_fixit_insert_after_utf8 (); + test_one_liner_fixit_remove_utf8 (); + test_one_liner_fixit_replace_utf8 (); + test_one_liner_fixit_replace_non_equal_range_utf8 (); + test_one_liner_fixit_replace_equal_secondary_range_utf8 (); + test_one_liner_fixit_validation_adhoc_locations_utf8 (); + test_one_liner_many_fixits_1_utf8 (); + test_one_liner_many_fixits_2_utf8 (); + test_one_liner_labels_utf8 (); +} + /* Verify that gcc_rich_location::add_location_if_nearby works. */ static void @@ -3221,13 +3901,16 @@ test_overlapped_fixit_printing (const line_table_case &case_) /* Unit-test the line_corrections machinery. 
*/ ASSERT_EQ (3, richloc.get_num_fixit_hints ()); const fixit_hint *hint_0 = richloc.get_fixit_hint (0); - ASSERT_EQ (column_range (12, 12), get_affected_columns (hint_0)); + ASSERT_EQ (column_range (12, 12), get_affected_range (hint_0, false)); + ASSERT_EQ (column_range (12, 12), get_affected_range (hint_0, true)); ASSERT_EQ (column_range (12, 22), get_printed_columns (hint_0)); const fixit_hint *hint_1 = richloc.get_fixit_hint (1); - ASSERT_EQ (column_range (18, 18), get_affected_columns (hint_1)); + ASSERT_EQ (column_range (18, 18), get_affected_range (hint_1, false)); + ASSERT_EQ (column_range (18, 18), get_affected_range (hint_1, true)); ASSERT_EQ (column_range (18, 20), get_printed_columns (hint_1)); const fixit_hint *hint_2 = richloc.get_fixit_hint (2); - ASSERT_EQ (column_range (29, 28), get_affected_columns (hint_2)); + ASSERT_EQ (column_range (29, 28), get_affected_range (hint_2, false)); + ASSERT_EQ (column_range (29, 28), get_affected_range (hint_2, true)); ASSERT_EQ (column_range (29, 29), get_printed_columns (hint_2)); /* Add each hint in turn to a line_corrections instance, @@ -3238,6 +3921,7 @@ test_overlapped_fixit_printing (const line_table_case &case_) /* The first replace hint by itself. 
*/ lc.add_hint (hint_0); ASSERT_EQ (1, lc.m_corrections.length ()); + ASSERT_EQ (column_range (12, 12), lc.m_corrections[0]->m_affected_bytes); ASSERT_EQ (column_range (12, 12), lc.m_corrections[0]->m_affected_columns); ASSERT_EQ (column_range (12, 22), lc.m_corrections[0]->m_printed_columns); ASSERT_STREQ ("const_cast<", lc.m_corrections[0]->m_text); @@ -3247,6 +3931,7 @@ test_overlapped_fixit_printing (const line_table_case &case_) lc.add_hint (hint_1); ASSERT_EQ (1, lc.m_corrections.length ()); ASSERT_STREQ ("const_cast (", lc.m_corrections[0]->m_text); + ASSERT_EQ (column_range (12, 18), lc.m_corrections[0]->m_affected_bytes); ASSERT_EQ (column_range (12, 18), lc.m_corrections[0]->m_affected_columns); ASSERT_EQ (column_range (12, 30), lc.m_corrections[0]->m_printed_columns); @@ -3256,6 +3941,7 @@ test_overlapped_fixit_printing (const line_table_case &case_) ASSERT_STREQ ("const_cast (ptr->field)", lc.m_corrections[0]->m_text); ASSERT_EQ (1, lc.m_corrections.length ()); + ASSERT_EQ (column_range (12, 28), lc.m_corrections[0]->m_affected_bytes); ASSERT_EQ (column_range (12, 28), lc.m_corrections[0]->m_affected_columns); ASSERT_EQ (column_range (12, 41), lc.m_corrections[0]->m_printed_columns); } @@ -3358,6 +4044,243 @@ test_overlapped_fixit_printing (const line_table_case &case_) } } +/* Multibyte-aware version of preceding tests. See comments above + test_one_liner_simple_caret_utf8() too, we use the same two multibyte + characters here. */ + +static void +test_overlapped_fixit_printing_utf8 (const line_table_case &case_) +{ + /* Create a tempfile and write some text to it. */ + + const char *content + /* Display columns. 
+ 00000000000000000000000111111111111111111111111222222222222222223 + 12344444444555555556789012344444444555555556789012345678999999990 */ + = " f\xf0\x9f\x98\x82 *f = (f\xf0\x9f\x98\x82 *)ptr->field\xcf\x80;\n"; + /* 00000000000000000000011111111111111111111112222222222333333333333 + 12344445555666677778901234566667777888899990123456789012333344445 + Byte columns. */ + + temp_source_file tmp (SELFTEST_LOCATION, ".C", content); + line_table_test ltt (case_); + + const line_map_ordinary *ord_map + = linemap_check_ordinary (linemap_add (line_table, LC_ENTER, false, + tmp.get_filename (), 0)); + + linemap_line_start (line_table, 1, 100); + + const location_t final_line_end + = linemap_position_for_line_and_column (line_table, ord_map, 6, 50); + + /* Don't attempt to run the tests if column data might be unavailable. */ + if (final_line_end > LINE_MAP_MAX_LOCATION_WITH_COLS) + return; + + /* A test for converting a C-style cast to a C++-style cast. */ + const location_t open_paren + = linemap_position_for_line_and_column (line_table, ord_map, 1, 14); + const location_t close_paren + = linemap_position_for_line_and_column (line_table, ord_map, 1, 22); + const location_t expr_start + = linemap_position_for_line_and_column (line_table, ord_map, 1, 23); + const location_t expr_finish + = linemap_position_for_line_and_column (line_table, ord_map, 1, 34); + const location_t expr = make_location (expr_start, expr_start, expr_finish); + + /* Various examples of fix-it hints that aren't themselves consolidated, + but for which the *printing* may need consolidation. */ + + /* Example where 3 fix-it hints are printed as one. 
*/ + { + test_diagnostic_context dc; + rich_location richloc (line_table, expr); + richloc.add_fixit_replace (open_paren, "const_cast<"); + richloc.add_fixit_replace (close_paren, "> ("); + richloc.add_fixit_insert_after (")"); + + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " f\xf0\x9f\x98\x82" + " *f = (f\xf0\x9f\x98\x82" + " *)ptr->field\xcf\x80" + ";\n" + " ^~~~~~~~~~~\n" + " ------------------\n" + " const_cast (ptr->field\xcf\x80" + ")\n", + pp_formatted_text (dc.printer)); + + /* Unit-test the line_corrections machinery. */ + ASSERT_EQ (3, richloc.get_num_fixit_hints ()); + const fixit_hint *hint_0 = richloc.get_fixit_hint (0); + ASSERT_EQ (column_range (14, 14), get_affected_range (hint_0, false)); + ASSERT_EQ (column_range (12, 12), get_affected_range (hint_0, true)); + ASSERT_EQ (column_range (12, 22), get_printed_columns (hint_0)); + const fixit_hint *hint_1 = richloc.get_fixit_hint (1); + ASSERT_EQ (column_range (22, 22), get_affected_range (hint_1, false)); + ASSERT_EQ (column_range (18, 18), get_affected_range (hint_1, true)); + ASSERT_EQ (column_range (18, 20), get_printed_columns (hint_1)); + const fixit_hint *hint_2 = richloc.get_fixit_hint (2); + ASSERT_EQ (column_range (35, 34), get_affected_range (hint_2, false)); + ASSERT_EQ (column_range (30, 29), get_affected_range (hint_2, true)); + ASSERT_EQ (column_range (30, 30), get_printed_columns (hint_2)); + + /* Add each hint in turn to a line_corrections instance, + and verify that they are consolidated into one correction instance + as expected. */ + line_corrections lc (tmp.get_filename (), 1); + + /* The first replace hint by itself. 
*/ + lc.add_hint (hint_0); + ASSERT_EQ (1, lc.m_corrections.length ()); + ASSERT_EQ (column_range (14, 14), lc.m_corrections[0]->m_affected_bytes); + ASSERT_EQ (column_range (12, 12), lc.m_corrections[0]->m_affected_columns); + ASSERT_EQ (column_range (12, 22), lc.m_corrections[0]->m_printed_columns); + ASSERT_STREQ ("const_cast<", lc.m_corrections[0]->m_text); + + /* After the second replacement hint, they are printed together + as a replacement (along with the text between them). */ + lc.add_hint (hint_1); + ASSERT_EQ (1, lc.m_corrections.length ()); + ASSERT_STREQ ("const_cast (", + lc.m_corrections[0]->m_text); + ASSERT_EQ (column_range (14, 22), lc.m_corrections[0]->m_affected_bytes); + ASSERT_EQ (column_range (12, 18), lc.m_corrections[0]->m_affected_columns); + ASSERT_EQ (column_range (12, 30), lc.m_corrections[0]->m_printed_columns); + + /* After the final insertion hint, they are all printed together + as a replacement (along with the text between them). */ + lc.add_hint (hint_2); + ASSERT_STREQ ("const_cast (ptr->field\xcf\x80)", + lc.m_corrections[0]->m_text); + ASSERT_EQ (1, lc.m_corrections.length ()); + ASSERT_EQ (column_range (14, 34), lc.m_corrections[0]->m_affected_bytes); + ASSERT_EQ (column_range (12, 29), lc.m_corrections[0]->m_affected_columns); + ASSERT_EQ (column_range (12, 42), lc.m_corrections[0]->m_printed_columns); + } + + /* Example where two are consolidated during printing. */ + { + test_diagnostic_context dc; + rich_location richloc (line_table, expr); + richloc.add_fixit_replace (open_paren, "CAST ("); + richloc.add_fixit_replace (close_paren, ") ("); + richloc.add_fixit_insert_after (")"); + + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " f\xf0\x9f\x98\x82" + " *f = (f\xf0\x9f\x98\x82" + " *)ptr->field\xcf\x80" + ";\n" + " ^~~~~~~~~~~\n" + " -\n" + " CAST (-\n" + " ) ( )\n", + pp_formatted_text (dc.printer)); + } + + /* Example where none are consolidated during printing. 
*/ + { + test_diagnostic_context dc; + rich_location richloc (line_table, expr); + richloc.add_fixit_replace (open_paren, "CST ("); + richloc.add_fixit_replace (close_paren, ") ("); + richloc.add_fixit_insert_after (")"); + + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " f\xf0\x9f\x98\x82" + " *f = (f\xf0\x9f\x98\x82" + " *)ptr->field\xcf\x80" + ";\n" + " ^~~~~~~~~~~\n" + " -\n" + " CST ( -\n" + " ) ( )\n", + pp_formatted_text (dc.printer)); + } + + /* Example of deletion fix-it hints. */ + { + test_diagnostic_context dc; + rich_location richloc (line_table, expr); + richloc.add_fixit_insert_before (open_paren, "(bar\xf0\x9f\x98\x82 *)"); + source_range victim = {open_paren, close_paren}; + richloc.add_fixit_remove (victim); + + /* This case is actually handled by fixit-consolidation, + rather than by line_corrections. */ + ASSERT_EQ (1, richloc.get_num_fixit_hints ()); + + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " f\xf0\x9f\x98\x82" + " *f = (f\xf0\x9f\x98\x82" + " *)ptr->field\xcf\x80" + ";\n" + " ^~~~~~~~~~~\n" + " -------\n" + " (bar\xf0\x9f\x98\x82" + " *)\n", + pp_formatted_text (dc.printer)); + } + + /* Example of deletion fix-it hints that would overlap. */ + { + test_diagnostic_context dc; + rich_location richloc (line_table, expr); + richloc.add_fixit_insert_before (open_paren, "(long\xf0\x9f\x98\x82 *)"); + source_range victim = {expr_start, expr_finish}; + richloc.add_fixit_remove (victim); + + /* These fixits are not consolidated. */ + ASSERT_EQ (2, richloc.get_num_fixit_hints ()); + + /* But the corrections are. */ + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " f\xf0\x9f\x98\x82" + " *f = (f\xf0\x9f\x98\x82" + " *)ptr->field\xcf\x80" + ";\n" + " ^~~~~~~~~~~\n" + " ------------------\n" + " (long\xf0\x9f\x98\x82" + " *)(f\xf0\x9f\x98\x82" + " *)\n", + pp_formatted_text (dc.printer)); + } + + /* Example of insertion fix-it hints that would overlap. 
*/ + { + test_diagnostic_context dc; + rich_location richloc (line_table, expr); + richloc.add_fixit_insert_before + (open_paren, "L\xf0\x9f\x98\x82NGER THAN THE CAST"); + richloc.add_fixit_insert_after (close_paren, "TEST"); + + /* The first insertion is long enough that if printed naively, + it would overlap with the second. + Verify that they are printed as a single replacement. */ + diagnostic_show_locus (&dc, &richloc, DK_ERROR); + ASSERT_STREQ ("\n" + " f\xf0\x9f\x98\x82" + " *f = (f\xf0\x9f\x98\x82" + " *)ptr->field\xcf\x80" + ";\n" + " ^~~~~~~~~~~\n" + " -------\n" + " L\xf0\x9f\x98\x82" + "NGER THAN THE CAST(f\xf0\x9f\x98\x82" + " *)TEST\n", + pp_formatted_text (dc.printer)); + } +} + /* Verify that the line_corrections machinery correctly prints overlapping fixit-hints that have been added in the wrong order. @@ -3407,10 +4330,10 @@ test_overlapped_fixit_printing_2 (const line_table_case &case_) /* These fixits should be accepted; they can't be consolidated. */ ASSERT_EQ (2, richloc.get_num_fixit_hints ()); const fixit_hint *hint_0 = richloc.get_fixit_hint (0); - ASSERT_EQ (column_range (23, 22), get_affected_columns (hint_0)); + ASSERT_EQ (column_range (23, 22), get_affected_range (hint_0, false)); ASSERT_EQ (column_range (23, 23), get_printed_columns (hint_0)); const fixit_hint *hint_1 = richloc.get_fixit_hint (1); - ASSERT_EQ (column_range (21, 20), get_affected_columns (hint_1)); + ASSERT_EQ (column_range (21, 20), get_affected_range (hint_1, false)); ASSERT_EQ (column_range (21, 21), get_printed_columns (hint_1)); /* Verify that they're printed correctly. 
*/ @@ -3737,10 +4660,12 @@ diagnostic_show_locus_c_tests () test_diagnostic_show_locus_unknown_location (); for_each_line_table_case (test_diagnostic_show_locus_one_liner); + for_each_line_table_case (test_diagnostic_show_locus_one_liner_utf8); for_each_line_table_case (test_add_location_if_nearby); for_each_line_table_case (test_diagnostic_show_locus_fixit_lines); for_each_line_table_case (test_fixit_consolidation); for_each_line_table_case (test_overlapped_fixit_printing); + for_each_line_table_case (test_overlapped_fixit_printing_utf8); for_each_line_table_case (test_overlapped_fixit_printing_2); for_each_line_table_case (test_fixit_insert_containing_newline); for_each_line_table_case (test_fixit_insert_containing_newline_2); diff --git a/gcc/diagnostic.c b/gcc/diagnostic.c index 96b6fa30052..8638fbebb2d 100644 --- a/gcc/diagnostic.c +++ b/gcc/diagnostic.c @@ -346,9 +346,13 @@ diagnostic_get_location_text (diagnostic_context *context, const char *locus_cs = colorize_start (pp_show_color (pp), "locus"); const char *locus_ce = colorize_stop (pp_show_color (pp)); const char *file = s.file ? s.file : progname; - int line = strcmp (file, N_("")) ? s.line : 0; - int col = context->show_column ? s.column : 0; - + int line = 0; + int col = 0; + if (strcmp (file, N_(""))) + { + line = s.line; + col = context->show_column ? 
location_compute_display_column (s) : 0; + } const char *line_col = maybe_line_and_column (line, col); return build_message_string ("%s%s%s:%s", locus_cs, file, line_col, locus_ce); diff --git a/gcc/input.c b/gcc/input.c index 00301ef68dd..d2d99000b84 100644 --- a/gcc/input.c +++ b/gcc/input.c @@ -908,6 +908,18 @@ make_location (location_t caret, source_range src_range) return COMBINE_LOCATION_DATA (line_table, pure_loc, src_range, NULL); } +int +location_compute_display_column (expanded_location exploc) +{ + if (!(exploc.file && exploc.line && exploc.column)) + return exploc.column; + char_span line = location_get_source_line (exploc.file, exploc.line); + /* If line is NULL, this function returns exploc.column which is the + desired fallback. */ + return cpp_byte_column_to_display_column (line.get_buffer (), line.length (), + exploc.column); +} + /* Dump statistics to stderr about the memory usage of the line_table set of line maps. This also displays some statistics about macro expansion. */ @@ -3590,6 +3602,51 @@ test_line_offset_overflow () ASSERT_NE (ordmap_a, ordmap_b); } +void test_cpp_utf8 () +{ + /* Verify that wcwidth of invalid UTF-8 or control bytes is 1. */ + { + int w_bad = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8); + ASSERT_EQ (8, w_bad); + int w_ctrl = cpp_display_width ("\r\t\n\v\0\1", 6); + ASSERT_EQ (6, w_ctrl); + } + + /* Verify that wcwidth of valid UTF-8 is as expected. */ + { + const int w_pi = cpp_display_width ("\xcf\x80", 2); + ASSERT_EQ (1, w_pi); + const int w_emoji = cpp_display_width ("\xf0\x9f\x98\x82", 4); + ASSERT_EQ (2, w_emoji); + const int w_ascii = cpp_display_width ("GCC", 3); + ASSERT_EQ (3, w_ascii); + const int w_mixed + = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82 \x9f!", 17); + ASSERT_EQ (14, w_mixed); + } + + /* Verify that cpp_byte_column_to_display_column can go past the end, + and similar edge cases. 
*/ + { + const char *str = "\xcf\x80 abc"; + ASSERT_EQ (5, cpp_display_width (str, 6)); + ASSERT_EQ (105, cpp_byte_column_to_display_column (str, 6, 106)); + ASSERT_EQ (10000, cpp_byte_column_to_display_column (NULL, 0, 10000)); + ASSERT_EQ (0, cpp_byte_column_to_display_column (NULL, 10000, 0)); + } + + /* Verify that cpp_display_column_to_byte_column can go past the end, + and similar edge cases. */ + { + const char *str = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello"; + ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2)); + ASSERT_EQ (15, cpp_display_column_to_byte_column (str, 15, 11)); + ASSERT_EQ (115, cpp_display_column_to_byte_column (str, 15, 111)); + ASSERT_EQ (10000, cpp_display_column_to_byte_column (NULL, 0, 10000)); + ASSERT_EQ (0, cpp_display_column_to_byte_column (NULL, 10000, 0)); + } +} + /* Run all of the selftests within this file. */ void @@ -3631,6 +3688,8 @@ input_c_tests () test_reading_source_line (); test_line_offset_overflow (); + + test_cpp_utf8 (); } } // namespace selftest diff --git a/gcc/input.h b/gcc/input.h index c459bf28553..35e02bd91d5 100644 --- a/gcc/input.h +++ b/gcc/input.h @@ -38,6 +38,7 @@ STATIC_ASSERT (BUILTINS_LOCATION < RESERVED_LOCATION_COUNT); extern bool is_location_from_builtin_token (location_t); extern expanded_location expand_location (location_t); +extern int location_compute_display_column (expanded_location); /* A class capturing the bounds of a buffer, to allow for run-time bounds-checking in a checked build. */ diff --git a/libcpp/charset.c b/libcpp/charset.c index 39af77a554a..d1bdff095eb 100644 --- a/libcpp/charset.c +++ b/libcpp/charset.c @@ -2257,3 +2257,106 @@ cpp_string_location_reader::get_next () m_loc += m_offset_per_column; return result; } + +/* Helper for cpp_byte_column_to_display_column and its inverse. Given a + pointer to a UTF-8-encoded character, compute its display width. 
*INBUFP + points on entry to the start of the UTF-8 encoding of the character, and + is updated to point just after the last byte of the encoding. *INBYTESLEFTP + contains on entry the remaining size of the buffer into which *INBUFP + points, and this is also updated accordingly. If *INBUFP does not + point to a valid UTF-8-encoded sequence, then it will be treated as a single + byte with display width 1. */ + +static inline int +compute_next_display_width (const uchar **inbufp, size_t *inbytesleftp) +{ + cppchar_t c; + if (one_utf8_to_cppchar (inbufp, inbytesleftp, &c) != 0) + { + /* Input is not convertible to UTF-8. This could be fine, e.g. in a + string literal, so don't complain. Just treat it as if it has a width + of one. */ + ++*inbufp; + --*inbytesleftp; + return 1; + } + + /* one_utf8_to_cppchar() has updated inbufp and inbytesleftp for us. */ + return cpp_wcwidth (c); +} + +/* For the string of length DATA_LENGTH bytes that begins at DATA, compute + how many display columns are occupied by the first COLUMN bytes. COLUMN + may exceed DATA_LENGTH, in which case the phantom bytes at the end are + treated as if they have display width 1. */ + +int +cpp_byte_column_to_display_column (const char *data, int data_length, + int column) +{ + int display_col = 0; + const uchar *udata = (const uchar *) data; + const int offset = MAX (0, column - data_length); + size_t inbytesleft = column - offset; + while (inbytesleft) + display_col += compute_next_display_width (&udata, &inbytesleft); + return display_col + offset; +} + +/* For the string of length DATA_LENGTH bytes that begins at DATA, compute + the least number of bytes that will result in at least DISPLAY_COL display + columns. The return value may exceed DATA_LENGTH if the entire string does + not occupy enough display columns. 
*/ + +int +cpp_display_column_to_byte_column (const char *data, int data_length, + int display_col) +{ + int column = 0; + const uchar *udata = (const uchar *) data; + size_t inbytesleft = data_length; + while (column < display_col && inbytesleft) + column += compute_next_display_width (&udata, &inbytesleft); + /* If the string ran out first, each missing display column counts as one + phantom byte. If instead the last character consumed was wider than + needed, COLUMN overshoots DISPLAY_COL; clamp the difference at zero so + the result never points into the middle of that character's encoding. */ + return data_length - inbytesleft + MAX (0, display_col - column); +} + +/* Our own version of wcwidth(). We don't use the actual wcwidth() in glibc, + because that will inspect the user's locale, and in particular in an ASCII + locale, it will not return anything useful for extended characters. But GCC + in other respects (see e.g. _cpp_default_encoding()) behaves as if + everything is UTF-8. We also make some tweaks that are useful for the way + GCC needs to use this data, e.g. tabs and other control characters should be + treated as having width 1. The lookup tables are generated by + contrib/unicode/gen_wcwidth.py and were made by simply calling glibc + wcwidth() on all codepoints, then applying the small tweaks. These tables + are not highly optimized, but for the present purpose of outputting + diagnostics, they are sufficient. */ + +#include "generated_cpp_wcwidth.h" +int cpp_wcwidth (cppchar_t c) +{ + if (__builtin_expect (c <= wcwidth_range_ends[0], true)) + return wcwidth_widths[0]; + + /* Binary search the tables.
*/ + int begin = 1; + static const int end + = sizeof wcwidth_range_ends / sizeof (*wcwidth_range_ends); + int len = end - begin; + do + { + int half = len/2; + int middle = begin + half; + if (c > wcwidth_range_ends[middle]) + { + begin = middle + 1; + len -= half + 1; + } + else + len = half; + } while (len); + + if (__builtin_expect (begin != end, true)) + return wcwidth_widths[begin]; + return 1; +} diff --git a/libcpp/generated_cpp_wcwidth.h b/libcpp/generated_cpp_wcwidth.h new file mode 100644 index 00000000000..ec8b73d3d01 --- /dev/null +++ b/libcpp/generated_cpp_wcwidth.h @@ -0,0 +1,156 @@ +/* Generated by contrib/unicode/gen_wcwidth.py, with the help of glibc's + utf8_gen.py, using version 12.1.0 of the Unicode standard. */ + +static const cppchar_t wcwidth_range_ends[] = { + 0x2ff, 0x36f, 0x482, 0x489, 0x590, 0x5bd, 0x5be, 0x5bf, + 0x5c0, 0x5c2, 0x5c3, 0x5c5, 0x5c6, 0x5c7, 0x60f, 0x61a, + 0x61b, 0x61c, 0x64a, 0x65f, 0x66f, 0x670, 0x6d5, 0x6dc, + 0x6de, 0x6e4, 0x6e6, 0x6e8, 0x6e9, 0x6ed, 0x710, 0x711, + 0x72f, 0x74a, 0x7a5, 0x7b0, 0x7ea, 0x7f3, 0x7fc, 0x7fd, + 0x815, 0x819, 0x81a, 0x823, 0x824, 0x827, 0x828, 0x82d, + 0x858, 0x85b, 0x8d2, 0x8e1, 0x8e2, 0x902, 0x939, 0x93a, + 0x93b, 0x93c, 0x940, 0x948, 0x94c, 0x94d, 0x950, 0x957, + 0x961, 0x963, 0x980, 0x981, 0x9bb, 0x9bc, 0x9c0, 0x9c4, + 0x9cc, 0x9cd, 0x9e1, 0x9e3, 0x9fd, 0x9fe, 0xa00, 0xa02, + 0xa3b, 0xa3c, 0xa40, 0xa42, 0xa46, 0xa48, 0xa4a, 0xa4d, + 0xa50, 0xa51, 0xa6f, 0xa71, 0xa74, 0xa75, 0xa80, 0xa82, + 0xabb, 0xabc, 0xac0, 0xac5, 0xac6, 0xac8, 0xacc, 0xacd, + 0xae1, 0xae3, 0xaf9, 0xaff, 0xb00, 0xb01, 0xb3b, 0xb3c, + 0xb3e, 0xb3f, 0xb40, 0xb44, 0xb4c, 0xb4d, 0xb55, 0xb56, + 0xb61, 0xb63, 0xb81, 0xb82, 0xbbf, 0xbc0, 0xbcc, 0xbcd, + 0xbff, 0xc00, 0xc03, 0xc04, 0xc3d, 0xc40, 0xc45, 0xc48, + 0xc49, 0xc4d, 0xc54, 0xc56, 0xc61, 0xc63, 0xc80, 0xc81, + 0xcbb, 0xcbc, 0xcbe, 0xcbf, 0xcc5, 0xcc6, 0xccb, 0xccd, + 0xce1, 0xce3, 0xcff, 0xd01, 0xd3a, 0xd3c, 0xd40, 0xd44, + 0xd4c, 0xd4d, 0xd61, 0xd63, 0xdc9, 0xdca, 
0xdd1, 0xdd4, + 0xdd5, 0xdd6, 0xe30, 0xe31, 0xe33, 0xe3a, 0xe46, 0xe4e, + 0xeb0, 0xeb1, 0xeb3, 0xebc, 0xec7, 0xecd, 0xf17, 0xf19, + 0xf34, 0xf35, 0xf36, 0xf37, 0xf38, 0xf39, 0xf70, 0xf7e, + 0xf7f, 0xf84, 0xf85, 0xf87, 0xf8c, 0xf97, 0xf98, 0xfbc, + 0xfc5, 0xfc6, 0x102c, 0x1030, 0x1031, 0x1037, 0x1038, 0x103a, + 0x103c, 0x103e, 0x1057, 0x1059, 0x105d, 0x1060, 0x1070, 0x1074, + 0x1081, 0x1082, 0x1084, 0x1086, 0x108c, 0x108d, 0x109c, 0x109d, + 0x10ff, 0x115f, 0x11ff, 0x135c, 0x135f, 0x1711, 0x1714, 0x1731, + 0x1734, 0x1751, 0x1753, 0x1771, 0x1773, 0x17b3, 0x17b5, 0x17b6, + 0x17bd, 0x17c5, 0x17c6, 0x17c8, 0x17d3, 0x17dc, 0x17dd, 0x180a, + 0x180e, 0x1884, 0x1886, 0x18a8, 0x18a9, 0x191f, 0x1922, 0x1926, + 0x1928, 0x1931, 0x1932, 0x1938, 0x193b, 0x1a16, 0x1a18, 0x1a1a, + 0x1a1b, 0x1a55, 0x1a56, 0x1a57, 0x1a5e, 0x1a5f, 0x1a60, 0x1a61, + 0x1a62, 0x1a64, 0x1a6c, 0x1a72, 0x1a7c, 0x1a7e, 0x1a7f, 0x1aaf, + 0x1abe, 0x1aff, 0x1b03, 0x1b33, 0x1b34, 0x1b35, 0x1b3a, 0x1b3b, + 0x1b3c, 0x1b41, 0x1b42, 0x1b6a, 0x1b73, 0x1b7f, 0x1b81, 0x1ba1, + 0x1ba5, 0x1ba7, 0x1ba9, 0x1baa, 0x1bad, 0x1be5, 0x1be6, 0x1be7, + 0x1be9, 0x1bec, 0x1bed, 0x1bee, 0x1bf1, 0x1c2b, 0x1c33, 0x1c35, + 0x1c37, 0x1ccf, 0x1cd2, 0x1cd3, 0x1ce0, 0x1ce1, 0x1ce8, 0x1cec, + 0x1ced, 0x1cf3, 0x1cf4, 0x1cf7, 0x1cf9, 0x1dbf, 0x1df9, 0x1dfa, + 0x1dff, 0x200a, 0x200f, 0x2029, 0x202e, 0x205f, 0x2064, 0x2065, + 0x206f, 0x20cf, 0x20f0, 0x2319, 0x231b, 0x2328, 0x232a, 0x23e8, + 0x23ec, 0x23ef, 0x23f0, 0x23f2, 0x23f3, 0x25fc, 0x25fe, 0x2613, + 0x2615, 0x2647, 0x2653, 0x267e, 0x267f, 0x2692, 0x2693, 0x26a0, + 0x26a1, 0x26a9, 0x26ab, 0x26bc, 0x26be, 0x26c3, 0x26c5, 0x26cd, + 0x26ce, 0x26d3, 0x26d4, 0x26e9, 0x26ea, 0x26f1, 0x26f3, 0x26f4, + 0x26f5, 0x26f9, 0x26fa, 0x26fc, 0x26fd, 0x2704, 0x2705, 0x2709, + 0x270b, 0x2727, 0x2728, 0x274b, 0x274c, 0x274d, 0x274e, 0x2752, + 0x2755, 0x2756, 0x2757, 0x2794, 0x2797, 0x27af, 0x27b0, 0x27be, + 0x27bf, 0x2b1a, 0x2b1c, 0x2b4f, 0x2b50, 0x2b54, 0x2b55, 0x2cee, + 0x2cf1, 0x2d7e, 0x2d7f, 0x2ddf, 
0x2dff, 0x2e7f, 0x2e99, 0x2e9a, + 0x2ef3, 0x2eff, 0x2fd5, 0x2fef, 0x2ffb, 0x2fff, 0x3029, 0x302d, + 0x303e, 0x3040, 0x3096, 0x3098, 0x309a, 0x30ff, 0x3104, 0x312f, + 0x3130, 0x318e, 0x318f, 0x31ba, 0x31bf, 0x31e3, 0x31ef, 0x321e, + 0x321f, 0x4db5, 0x4dbf, 0x9fef, 0x9fff, 0xa48c, 0xa48f, 0xa4c6, + 0xa66e, 0xa672, 0xa673, 0xa67d, 0xa69d, 0xa69f, 0xa6ef, 0xa6f1, + 0xa801, 0xa802, 0xa805, 0xa806, 0xa80a, 0xa80b, 0xa824, 0xa826, + 0xa8c3, 0xa8c5, 0xa8df, 0xa8f1, 0xa8fe, 0xa8ff, 0xa925, 0xa92d, + 0xa946, 0xa951, 0xa95f, 0xa97c, 0xa97f, 0xa982, 0xa9b2, 0xa9b3, + 0xa9b5, 0xa9b9, 0xa9bb, 0xa9bd, 0xa9e4, 0xa9e5, 0xaa28, 0xaa2e, + 0xaa30, 0xaa32, 0xaa34, 0xaa36, 0xaa42, 0xaa43, 0xaa4b, 0xaa4c, + 0xaa7b, 0xaa7c, 0xaaaf, 0xaab0, 0xaab1, 0xaab4, 0xaab6, 0xaab8, + 0xaabd, 0xaabf, 0xaac0, 0xaac1, 0xaaeb, 0xaaed, 0xaaf5, 0xaaf6, + 0xabe4, 0xabe5, 0xabe7, 0xabe8, 0xabec, 0xabed, 0xabff, 0xd7a3, + 0xf8ff, 0xfa6d, 0xfa6f, 0xfad9, 0xfb1d, 0xfb1e, 0xfdff, 0xfe0f, + 0xfe19, 0xfe1f, 0xfe2f, 0xfe52, 0xfe53, 0xfe66, 0xfe67, 0xfe6b, + 0xfefe, 0xfeff, 0xff00, 0xff60, 0xffdf, 0xffe6, 0xfff8, 0xfffb, + 0x101fc, 0x101fd, 0x102df, 0x102e0, 0x10375, 0x1037a, 0x10a00, 0x10a03, + 0x10a04, 0x10a06, 0x10a0b, 0x10a0f, 0x10a37, 0x10a3a, 0x10a3e, 0x10a3f, + 0x10ae4, 0x10ae6, 0x10d23, 0x10d27, 0x10f45, 0x10f50, 0x11000, 0x11001, + 0x11037, 0x11046, 0x1107e, 0x11081, 0x110b2, 0x110b6, 0x110b8, 0x110ba, + 0x110ff, 0x11102, 0x11126, 0x1112b, 0x1112c, 0x11134, 0x11172, 0x11173, + 0x1117f, 0x11181, 0x111b5, 0x111be, 0x111c8, 0x111cc, 0x1122e, 0x11231, + 0x11233, 0x11234, 0x11235, 0x11237, 0x1123d, 0x1123e, 0x112de, 0x112df, + 0x112e2, 0x112ea, 0x112ff, 0x11301, 0x1133a, 0x1133c, 0x1133f, 0x11340, + 0x11365, 0x1136c, 0x1136f, 0x11374, 0x11437, 0x1143f, 0x11441, 0x11444, + 0x11445, 0x11446, 0x1145d, 0x1145e, 0x114b2, 0x114b8, 0x114b9, 0x114ba, + 0x114be, 0x114c0, 0x114c1, 0x114c3, 0x115b1, 0x115b5, 0x115bb, 0x115bd, + 0x115be, 0x115c0, 0x115db, 0x115dd, 0x11632, 0x1163a, 0x1163c, 0x1163d, + 0x1163e, 0x11640, 
0x116aa, 0x116ab, 0x116ac, 0x116ad, 0x116af, 0x116b5, + 0x116b6, 0x116b7, 0x1171c, 0x1171f, 0x11721, 0x11725, 0x11726, 0x1172b, + 0x1182e, 0x11837, 0x11838, 0x1183a, 0x119d3, 0x119d7, 0x119d9, 0x119db, + 0x119df, 0x119e0, 0x11a00, 0x11a0a, 0x11a32, 0x11a38, 0x11a3a, 0x11a3e, + 0x11a46, 0x11a47, 0x11a50, 0x11a56, 0x11a58, 0x11a5b, 0x11a89, 0x11a96, + 0x11a97, 0x11a99, 0x11c2f, 0x11c36, 0x11c37, 0x11c3d, 0x11c3e, 0x11c3f, + 0x11c91, 0x11ca7, 0x11ca9, 0x11cb0, 0x11cb1, 0x11cb3, 0x11cb4, 0x11cb6, + 0x11d30, 0x11d36, 0x11d39, 0x11d3a, 0x11d3b, 0x11d3d, 0x11d3e, 0x11d45, + 0x11d46, 0x11d47, 0x11d8f, 0x11d91, 0x11d94, 0x11d95, 0x11d96, 0x11d97, + 0x11ef2, 0x11ef4, 0x1342f, 0x13438, 0x16aef, 0x16af4, 0x16b2f, 0x16b36, + 0x16f4e, 0x16f4f, 0x16f8e, 0x16f92, 0x16fdf, 0x16fe3, 0x16fff, 0x187f7, + 0x187ff, 0x18af2, 0x1afff, 0x1b11e, 0x1b14f, 0x1b152, 0x1b163, 0x1b167, + 0x1b16f, 0x1b2fb, 0x1bc9c, 0x1bc9e, 0x1bc9f, 0x1bca3, 0x1d166, 0x1d169, + 0x1d172, 0x1d182, 0x1d184, 0x1d18b, 0x1d1a9, 0x1d1ad, 0x1d241, 0x1d244, + 0x1d9ff, 0x1da36, 0x1da3a, 0x1da6c, 0x1da74, 0x1da75, 0x1da83, 0x1da84, + 0x1da9a, 0x1da9f, 0x1daa0, 0x1daaf, 0x1dfff, 0x1e006, 0x1e007, 0x1e018, + 0x1e01a, 0x1e021, 0x1e022, 0x1e024, 0x1e025, 0x1e02a, 0x1e12f, 0x1e136, + 0x1e2eb, 0x1e2ef, 0x1e8cf, 0x1e8d6, 0x1e943, 0x1e94a, 0x1f003, 0x1f004, + 0x1f0ce, 0x1f0cf, 0x1f18d, 0x1f18e, 0x1f190, 0x1f19a, 0x1f1ff, 0x1f202, + 0x1f20f, 0x1f23b, 0x1f23f, 0x1f248, 0x1f24f, 0x1f251, 0x1f25f, 0x1f265, + 0x1f2ff, 0x1f320, 0x1f32c, 0x1f335, 0x1f336, 0x1f37c, 0x1f37d, 0x1f393, + 0x1f39f, 0x1f3ca, 0x1f3ce, 0x1f3d3, 0x1f3df, 0x1f3f0, 0x1f3f3, 0x1f3f4, + 0x1f3f7, 0x1f43e, 0x1f43f, 0x1f440, 0x1f441, 0x1f4fc, 0x1f4fe, 0x1f53d, + 0x1f54a, 0x1f54e, 0x1f54f, 0x1f567, 0x1f579, 0x1f57a, 0x1f594, 0x1f596, + 0x1f5a3, 0x1f5a4, 0x1f5fa, 0x1f64f, 0x1f67f, 0x1f6c5, 0x1f6cb, 0x1f6cc, + 0x1f6cf, 0x1f6d2, 0x1f6d4, 0x1f6d5, 0x1f6ea, 0x1f6ec, 0x1f6f3, 0x1f6fa, + 0x1f7df, 0x1f7eb, 0x1f90c, 0x1f971, 0x1f972, 0x1f976, 0x1f979, 0x1f9a2, + 0x1f9a4, 0x1f9aa, 
0x1f9ad, 0x1f9ca, 0x1f9cc, 0x1f9ff, 0x1fa6f, 0x1fa73, + 0x1fa77, 0x1fa7a, 0x1fa7f, 0x1fa82, 0x1fa8f, 0x1fa95, 0x1ffff, 0x2a6d6, + 0x2a6ff, 0x2b734, 0x2b73f, 0x2b81d, 0x2b81f, 0x2cea1, 0x2ceaf, 0x2ebe0, + 0x2f7ff, 0x2fa1d, 0xe0000, 0xe0001, 0xe001f, 0xe007f, 0xe00ff, 0xe01ef, +}; + +static const unsigned char wcwidth_widths[] = { + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, + 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, + 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, + 0, 1, 0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 0, 2, 1, 2, 1, 0, 2, 1, 2, + 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 0, 1, 0, + 2, 1, 0, 2, 1, 2, 1, 2, 1, 0, 1, 2, 1, 
2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, + 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, + 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, + 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, + 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 0, 1, 0, 1, 0, +}; diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h index a645f8136a6..fdc8badba7d 100644 --- a/libcpp/include/cpplib.h +++ b/libcpp/include/cpplib.h @@ -1305,4 +1305,15 @@ extern bool cpp_userdef_char_p extern const char * cpp_get_userdef_suffix (const cpp_token *); +/* In charset.c */ +int cpp_byte_column_to_display_column (const char *data, int data_length, + int column); +inline int cpp_display_width (const char *data, int data_length) +{ + return cpp_byte_column_to_display_column (data, data_length, data_length); +} +int cpp_display_column_to_byte_column (const char *data, int data_length, + int display_col); +int cpp_wcwidth (cppchar_t c); + #endif /* ! LIBCPP_CPPLIB_H */