diff --git a/contrib/unicode/from_glibc/unicode_utils.py b/contrib/unicode/from_glibc/unicode_utils.py
new file mode 100644
index 00000000000..a9e94cce418
--- /dev/null
+++ b/contrib/unicode/from_glibc/unicode_utils.py
@@ -0,0 +1,527 @@
+# Utilities to generate Unicode data for glibc from upstream Unicode data.
+#
+# Copyright (C) 2014-2019 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+'''
+This module contains utilities used by the scripts to generate
+Unicode data for glibc from upstream Unicode data files.
+'''
+
+import sys
+import re
+
+
+# Common locale header. Every line of the text starts with '%'.
+COMMENT_HEADER = """
+% This file is part of the GNU C Library and contains locale data.
+% The Free Software Foundation does not claim any copyright interest
+% in the locale data contained in this file. The foregoing does not
+% affect the license of the GNU C Library as a whole. It does not
+% exempt you from the conditions of the license if your use would
+% otherwise be governed by that license.
+"""
+
+# Dictionary holding the entire contents of the UnicodeData.txt file.
+# Keys are integer code points; it is filled by fill_attributes().
+#
+# Contents of this dictionary look like this:
+#
+# {0: {'category': 'Cc',
+#      'title': None,
+#      'digit': '',
+#      'name': '<control>',
+#      'bidi': 'BN',
+#      'combining': '0',
+#      'comment': '',
+#      'oldname': 'NULL',
+#      'decomposition': '',
+#      'upper': None,
+#      'mirrored': 'N',
+#      'lower': None,
+#      'decdigit': '',
+#      'numeric': ''},
+#  …
+# }
+UNICODE_ATTRIBUTES = {}
+
+# Dictionary holding the entire contents of the DerivedCoreProperties.txt
+# file. Keys are integer code points, values are lists of property names;
+# it is filled by fill_derived_core_properties().
+#
+# Contents of this dictionary look like this:
+#
+# {917504: ['Default_Ignorable_Code_Point'],
+#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
+#  …
+# }
+DERIVED_CORE_PROPERTIES = {}
+
+# Dictionary holding the entire contents of the EastAsianWidths.txt file.
+# Keys are integer code points, values are the width property strings;
+# it is filled by fill_east_asian_widths().
+#
+# Contents of this dictionary look like this:
+#
+# {0: 'N', … , 45430: 'W', …}
+EAST_ASIAN_WIDTHS = {}
+
+def fill_attribute(code_point, fields):
+    '''Records one UnicodeData.txt line in UNICODE_ATTRIBUTES.
+
+    code_point is the integer key to store the entry under and fields
+    is the list of semicolon separated fields of the line. Fields 1 to
+    11 are stored unchanged as strings. Fields 12 to 14 (the simple
+    case mappings) are stored as integers, or as None when empty.
+
+    '''
+    def hex_or_none(text):
+        return int(text, 16) if text else None
+
+    # Attribute names in the order of the fields 1..11 of a line.
+    string_keys = (
+        'name',           # Character name
+        'category',       # General category
+        'combining',      # Canonical combining classes
+        'bidi',           # Bidirectional category
+        'decomposition',  # Character decomposition mapping
+        'decdigit',       # Decimal digit value
+        'digit',          # Digit value
+        'numeric',        # Numeric value
+        'mirrored',       # mirrored
+        'oldname',        # Old Unicode 1.0 name
+        'comment',        # comment
+    )
+    # Attribute names in the order of the fields 12..14 of a line.
+    mapping_keys = ('upper', 'lower', 'title')
+
+    entry = {}
+    for index, key in enumerate(string_keys, start=1):
+        entry[key] = fields[index]
+    for index, key in enumerate(mapping_keys, start=12):
+        entry[key] = hex_or_none(fields[index])
+    UNICODE_ATTRIBUTES[code_point] = entry
+
+def fill_attributes(filename):
+    '''Stores the entire contents of the UnicodeData.txt file
+    in the UNICODE_ATTRIBUTES dictionary.
+
+    A typical line for a single code point in UnicodeData.txt looks
+    like this:
+
+    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
+
+    Code point ranges are indicated by pairs of lines like this:
+
+    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
+    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
+
+    For such a pair every code point of the range gets an entry whose
+    name is the range name without the angle brackets and without the
+    “, First”/“, Last” suffix. Surrogates (category “Cs”) are skipped.
+
+    Writes a message to stderr and exits with status 1 when a line
+    does not have exactly 15 fields or when the two lines of a range
+    disagree in anything but the code point.
+    '''
+    with open(filename, mode='r') as unicode_data_file:
+        fields_start = []
+        for line in unicode_data_file:
+            fields = line.strip().split(';')
+            if len(fields) != 15:
+                sys.stderr.write(
+                    'short line in file "%(f)s": %(l)s\n' %{
+                        'f': filename, 'l': line})
+                # sys.exit() instead of the exit() helper, which is only
+                # available when the site module has been loaded.
+                sys.exit(1)
+            if fields[2] == 'Cs':
+                # Surrogates are UTF-16 artefacts,
+                # not real characters. Ignore them.
+                fields_start = []
+                continue
+            if fields[1].endswith(', First>'):
+                fields_start = fields
+                # '<CJK Ideograph, First>' -> 'CJK Ideograph'
+                fields_start[1] = fields_start[1].split(',')[0][1:]
+                continue
+            if fields[1].endswith(', Last>'):
+                fields[1] = fields[1].split(',')[0][1:]
+                if fields[1:] != fields_start[1:]:
+                    sys.stderr.write(
+                        'broken code point range in file "%(f)s": %(l)s\n' %{
+                            'f': filename, 'l': line})
+                    sys.exit(1)
+                for code_point in range(
+                        int(fields_start[0], 16),
+                        int(fields[0], 16)+1):
+                    fill_attribute(code_point, fields)
+                fields_start = []
+                continue
+            fill_attribute(int(fields[0], 16), fields)
+            fields_start = []
+
+def fill_derived_core_properties(filename):
+    '''Stores the entire contents of the DerivedCoreProperties.txt file
+    in the DERIVED_CORE_PROPERTIES dictionary.
+
+    Lines in DerivedCoreProperties.txt are either a code point range like
+    this:
+
+    0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
+
+    or a single code point like this:
+
+    00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR
+
+    Lines which do not start with a code point (comments, empty lines)
+    are ignored. The property name is appended to the list stored for
+    every code point of the line.
+
+    '''
+    # The group names are required by the match.group() calls below.
+    line_pattern = re.compile(
+        r'^(?P<codepoint1>[0-9A-F]{4,6})'
+        + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
+        + r'\s*;\s*(?P<property>[a-zA-Z_]+)')
+    with open(filename, mode='r') as derived_core_properties_file:
+        for line in derived_core_properties_file:
+            match = line_pattern.match(line)
+            if not match:
+                continue
+            start = match.group('codepoint1')
+            # A single code point is a range of length one.
+            end = match.group('codepoint2') or start
+            prop = match.group('property')
+            for code_point in range(int(start, 16), int(end, 16)+1):
+                DERIVED_CORE_PROPERTIES.setdefault(
+                    code_point, []).append(prop)
+
+def fill_east_asian_widths(filename):
+    '''Stores the entire contents of the EastAsianWidths.txt file
+    in the EAST_ASIAN_WIDTHS dictionary.
+
+    Lines in EastAsianWidths.txt are either a code point range like
+    this:
+
+    9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF>
+
+    or a single code point like this:
+
+    A015;W # Lm YI SYLLABLE WU
+
+    Lines which do not start with a code point (comments, empty lines)
+    are ignored. A later line overwrites the width stored by an
+    earlier line for the same code point.
+    '''
+    # The group names are required by the match.group() calls below.
+    line_pattern = re.compile(
+        r'^(?P<codepoint1>[0-9A-F]{4,6})'
+        + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
+        + r'\s*;\s*(?P<property>[a-zA-Z]+)')
+    with open(filename, mode='r') as east_asian_widths_file:
+        for line in east_asian_widths_file:
+            match = line_pattern.match(line)
+            if not match:
+                continue
+            start = match.group('codepoint1')
+            # A single code point is a range of length one.
+            end = match.group('codepoint2') or start
+            width = match.group('property')
+            for code_point in range(int(start, 16), int(end, 16)+1):
+                EAST_ASIAN_WIDTHS[code_point] = width
+
+def to_upper(code_point):
+    '''Returns the code point of the uppercase version
+    of the given code point
+
+    The code point itself is returned when its entry in
+    UNICODE_ATTRIBUTES has no name or no uppercase mapping.'''
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    if not attributes['name'] or not attributes['upper']:
+        return code_point
+    return attributes['upper']
+
+def to_lower(code_point):
+    '''Returns the code point of the lowercase version
+    of the given code point
+
+    The code point itself is returned when its entry in
+    UNICODE_ATTRIBUTES has no name or no lowercase mapping.'''
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    if not attributes['name'] or not attributes['lower']:
+        return code_point
+    return attributes['lower']
+
+def to_upper_turkish(code_point):
+    '''Returns the code point of the Turkish uppercase version
+    of the given code point
+
+    Only U+0069 is special: it maps to U+0130. Everything else is
+    mapped by to_upper().'''
+    return 0x0130 if code_point == 0x0069 else to_upper(code_point)
+
+def to_lower_turkish(code_point):
+    '''Returns the code point of the Turkish lowercase version
+    of the given code point
+
+    Only U+0049 is special: it maps to U+0131. Everything else is
+    mapped by to_lower().'''
+    return 0x0131 if code_point == 0x0049 else to_lower(code_point)
+
+def to_title(code_point):
+    '''Returns the code point of the titlecase version
+    of the given code point
+
+    The code point itself is returned when its entry in
+    UNICODE_ATTRIBUTES has no name or no titlecase mapping.'''
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    if not attributes['name'] or not attributes['title']:
+        return code_point
+    return attributes['title']
+
+def is_upper(code_point):
+    '''Checks whether the character with this code point is uppercase
+
+    A character is uppercase when it has a lowercase mapping or when
+    DerivedCoreProperties.txt lists it as “Uppercase”.'''
+    if to_lower(code_point) != code_point:
+        return True
+    return 'Uppercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
+
+def is_lower(code_point):
+    '''Checks whether the character with this code point is lowercase
+
+    A character is lowercase when it has an uppercase mapping, when it
+    is U+00DF, or when DerivedCoreProperties.txt lists it as
+    “Lowercase”.'''
+    if to_upper(code_point) != code_point:
+        return True
+    if code_point == 0x00DF:
+        # is lowercase, but without simple to_upper mapping.
+        return True
+    # Some characters are defined as “Lowercase” in
+    # DerivedCoreProperties.txt but do not have a mapping to upper
+    # case. “LATIN LETTER SMALL CAPITAL F” is one of these.
+    return 'Lowercase' in DERIVED_CORE_PROPERTIES.get(code_point, ())
+
+def is_alpha(code_point):
+    '''Checks whether the character with this code point is alphabetic
+
+    A character is alphabetic when DerivedCoreProperties.txt lists it
+    as “Alphabetic” or when it is a decimal digit (category “Nd”)
+    outside of the ASCII digits U+0030..U+0039.'''
+    if 'Alphabetic' in DERIVED_CORE_PROPERTIES.get(code_point, ()):
+        return True
+    # Consider all the non-ASCII digits as alphabetic.
+    # ISO C 99 forbids us to have them in category “digit”,
+    # but we want iswalnum to return true on them.
+    return (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
+            and not 0x0030 <= code_point <= 0x0039)
+
+def is_digit(code_point):
+    '''Checks whether the character with this code point is a digit
+
+    Returns True only for the ASCII digits U+0030..U+0039.'''
+    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
+    # takes it away:
+    # 7.25.2.1.5:
+    #    The iswdigit function tests for any wide character that
+    #    corresponds to a decimal-digit character (as defined in 5.2.1).
+    # 5.2.1:
+    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
+    #
+    # The alternative would be to accept every named character of
+    # category “Nd”. Note that U+0BE7..U+0BEF and U+1369..U+1371 are
+    # digit systems without a zero; a <0> would have to be added in
+    # front of them by hand.
+    return 0x0030 <= code_point <= 0x0039
+
+def is_outdigit(code_point):
+    '''Checks whether the character with this code point is outdigit
+
+    Returns True only for the ASCII digits U+0030..U+0039.'''
+    return 0x0030 <= code_point <= 0x0039
+
+def is_blank(code_point):
+    '''Checks whether the character with this code point is blank
+
+    Blank are the horizontal tab and all named characters of category
+    “Zs” whose decomposition is not a <noBreak> decomposition. The
+    result is meant to be used as a truth value.'''
+    if code_point == 0x0009: # '\t'
+        return True
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    # Category Zs without mention of '<noBreak>'. The tag has to be
+    # spelled out: testing for the empty string would never succeed
+    # because the empty string is contained in every string.
+    return (attributes['name']
+            and attributes['category'] == 'Zs'
+            and '<noBreak>' not in attributes['decomposition'])
+
+def is_space(code_point):
+    '''Checks whether the character with this code point is a space
+
+    Spaces are the six ASCII white space characters, all named
+    characters of the categories “Zl” and “Zp”, and the named
+    characters of category “Zs” whose decomposition is not a <noBreak>
+    decomposition. The result is meant to be used as a truth value.'''
+    # Don't make U+00A0 a space. Non-breaking space means that all programs
+    # should treat it like a punctuation character, not like a space.
+    if code_point in (0x0020,   # ' '
+                      0x000C,   # '\f'
+                      0x000A,   # '\n'
+                      0x000D,   # '\r'
+                      0x0009,   # '\t'
+                      0x000B):  # '\v'
+        return True
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    # Categories Zl, Zp, and Zs without mention of "<noBreak>". The tag
+    # has to be spelled out: testing for the empty string would never
+    # succeed because the empty string is contained in every string.
+    return (attributes['name']
+            and
+            (attributes['category'] in ['Zl', 'Zp']
+             or
+             (attributes['category'] in ['Zs']
+              and
+              '<noBreak>' not in attributes['decomposition'])))
+
+def is_cntrl(code_point):
+    '''Checks whether the character with this code point is
+    a control character
+
+    Control characters are the characters named “<control>” in
+    UnicodeData.txt and the named characters of the categories “Zl”
+    and “Zp”. The result is meant to be used as a truth value.'''
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    # The name has to be compared with '<control>': a non-empty name
+    # can never be equal to the empty string.
+    return (attributes['name']
+            and (attributes['name'] == '<control>'
+                 or
+                 attributes['category'] in ['Zl', 'Zp']))
+
+def is_xdigit(code_point):
+    '''Checks whether the character with this code point is
+    a hexadecimal digit
+
+    Returns True only for the ASCII characters 0-9, A-F and a-f.'''
+    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
+    # takes it away:
+    # 7.25.2.1.12:
+    #    The iswxdigit function tests for any wide character that
+    #    corresponds to a hexadecimal-digit character (as defined
+    #    in 6.4.4.1).
+    # 6.4.4.1:
+    #    hexadecimal-digit: one of
+    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
+    return (0x0030 <= code_point <= 0x0039
+            or 0x0041 <= code_point <= 0x0046
+            or 0x0061 <= code_point <= 0x0066)
+
+def is_graph(code_point):
+    '''Checks whether the character with this code point is
+    a graphical character
+
+    Graphical are all named characters which are neither named
+    “<control>” nor a space according to is_space(). The result is
+    meant to be used as a truth value.'''
+    name = UNICODE_ATTRIBUTES[code_point]['name']
+    # Comparing with the empty string would not exclude anything here,
+    # the control characters carry the name '<control>'.
+    return (name
+            and name != '<control>'
+            and not is_space(code_point))
+
+def is_print(code_point):
+    '''Checks whether the character with this code point is printable
+
+    Printable are all named characters which are neither named
+    “<control>” nor of category “Zl” or “Zp”. The result is meant to
+    be used as a truth value.'''
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    # Comparing with the empty string would not exclude anything here,
+    # the control characters carry the name '<control>'.
+    return (attributes['name']
+            and attributes['name'] != '<control>'
+            and attributes['category'] not in ['Zl', 'Zp'])
+
+def is_punct(code_point):
+    '''Checks whether the character with this code point is punctuation
+
+    The result is meant to be used as a truth value.'''
+    # The traditional POSIX definition of punctuation is every graphic,
+    # non-alphanumeric character. The alternative, not used here, would
+    # be every named character whose category starts with “P”.
+    return (is_graph(code_point)
+            and not is_alpha(code_point)
+            and not is_digit(code_point))
+
+def is_combining(code_point):
+    '''Checks whether the character with this code point is
+    a combining character
+
+    The result is meant to be used as a truth value.'''
+    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
+    # file. In 3.0.1 it was identical to the union of the general categories
+    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
+    # PropList.txt file, so we take the latter definition.
+    attributes = UNICODE_ATTRIBUTES[code_point]
+    return attributes['name'] and attributes['category'] in ['Mn', 'Mc', 'Me']
+
+def is_combining_level3(code_point):
+    '''Checks whether the character with this code point is
+    a combining level3 character
+
+    That is a combining character according to is_combining() whose
+    canonical combining class is below 200.'''
+    return (is_combining(code_point)
+            and
+            0 <= int(UNICODE_ATTRIBUTES[code_point]['combining']) < 200)
+
+def ucs_symbol(code_point):
+    '''Return the UCS symbol string for a Unicode character.
+
+    Code points below U+10000 are written with four hexadecimal
+    digits, for example <U0041>, all others with eight, for example
+    <U0010FFFD>.'''
+    if code_point < 0x10000:
+        return '<U{:04X}>'.format(code_point)
+    return '<U{:08X}>'.format(code_point)
+
+def ucs_symbol_range(code_point_low, code_point_high):
+    '''Returns the UCS symbol string for a code point range.
+
+    The symbols of both ends are joined by two dots.
+
+    Example:
+
+    <U0041>..<U005A>
+    '''
+    symbols = (ucs_symbol(code_point_low), ucs_symbol(code_point_high))
+    return '..'.join(symbols)
+
+def verifications():
+    '''Tests whether the is_* functions observe the known restrictions
+
+    Checks every code point in UNICODE_ATTRIBUTES, in ascending order,
+    and writes one line to stderr for every violated restriction.
+    Violations do not raise and nothing is returned.
+    '''
+    class_tests = {
+        'alpha': is_alpha, 'cntrl': is_cntrl, 'digit': is_digit,
+        'graph': is_graph, 'print': is_print, 'punct': is_punct,
+        'space': is_space, 'xdigit': is_xdigit,
+    }
+    # Pairs of classes which must not have a character in common,
+    # listed in the order in which violations are reported.
+    disjoint_classes = (
+        # alpha restriction: “No character specified for the keywords cntrl,
+        # digit, punct or space shall be specified.”
+        ('alpha', 'cntrl'), ('alpha', 'digit'),
+        ('alpha', 'punct'), ('alpha', 'space'),
+        # space restriction: “No character specified for the keywords upper,
+        # lower, alpha, digit, graph or xdigit shall be specified.”
+        # upper, lower, alpha already checked above.
+        ('space', 'digit'), ('space', 'graph'), ('space', 'xdigit'),
+        # cntrl restriction: “No character specified for the keywords upper,
+        # lower, alpha, digit, punct, graph, print or xdigit shall be
+        # specified.” upper, lower, alpha already checked above.
+        ('cntrl', 'digit'), ('cntrl', 'punct'), ('cntrl', 'graph'),
+        ('cntrl', 'print'), ('cntrl', 'xdigit'),
+        # punct restriction: “No character specified for the keywords upper,
+        # lower, alpha, digit, cntrl, xdigit or as the <space> character
+        # shall be specified.” upper, lower, alpha, cntrl already checked
+        # above, the <space> character is checked separately below.
+        ('punct', 'digit'), ('punct', 'xdigit'),
+    )
+
+    def complain(message, code_point):
+        '''Writes message with the symbol of code_point to stderr.'''
+        sys.stderr.write(message %{'sym': ucs_symbol(code_point)})
+
+    for code_point in sorted(UNICODE_ATTRIBUTES):
+        cased = is_lower(code_point) or is_upper(code_point)
+        # toupper and tolower restriction: “Only characters specified
+        # for the keywords lower and upper shall be specified.”
+        for function_name, mapping in (('toupper', to_upper),
+                                       ('tolower', to_lower)):
+            mapped = mapping(code_point)
+            if mapped != code_point and not cased:
+                sys.stderr.write(
+                    ('%(sym)s is not upper|lower '
+                     + 'but %(fn)s(0x%(c)04X) = 0x%(uc)04X\n') %{
+                         'sym': ucs_symbol(code_point),
+                         'fn': function_name,
+                         'c': code_point,
+                         'uc': mapped})
+        # alpha restriction: “Characters classified as either upper or lower
+        # shall automatically belong to this class.”
+        if cased and not is_alpha(code_point):
+            complain('%(sym)s is upper|lower but not alpha\n', code_point)
+        for first, second in disjoint_classes:
+            if (class_tests[first](code_point)
+                    and class_tests[second](code_point)):
+                complain(
+                    '%(sym)s is ' + first + ' and ' + second + '\n',
+                    code_point)
+        if is_punct(code_point) and code_point == 0x0020:
+            complain('%(sym)s is punct\n', code_point)
+        # graph restriction: “No character specified for the keyword cntrl
+        # shall be specified.” Already checked above.
+
+        # print restriction: “No character specified for the keyword cntrl
+        # shall be specified.” Already checked above.
+
+        # graph - print relation: differ only in the <space> character.
+        # How is this possible if there are more than one space character?!
+        # I think susv2/xbd/locale.html should speak of “space characters”,
+        # not “space character”.
+        #
+        # ucs_symbol() is called directly (through complain): this module
+        # does not import itself under the name “unicode_utils”.
+        if (is_print(code_point)
+                and not (is_graph(code_point) or is_space(code_point))):
+            complain('%(sym)s is print but not graph|<space>\n', code_point)
+        if (not is_print(code_point)
+                and (is_graph(code_point) or code_point == 0x0020)):
+            complain('%(sym)s is graph|<space> but not print\n', code_point)
diff --git a/contrib/unicode/from_glibc/utf8_gen.py b/contrib/unicode/from_glibc/utf8_gen.py
new file mode 100755
index 00000000000..0e5583cd259
--- /dev/null
+++ b/contrib/unicode/from_glibc/utf8_gen.py
@@ -0,0 +1,364 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# Copyright (C) 2014-2019 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+#
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+'''glibc/localedata/charmaps/UTF-8 file generator script
+
+This script generates a glibc/localedata/charmaps/UTF-8 file
+from Unicode data.
+
+Usage: python3 utf8_gen.py --unicode_version VERSION [-u FILE] [-e FILE] [-p FILE]
+
+It writes the result to a file named UTF-8 in the current directory.
+'''
+
+import argparse
+import sys
+import re
+import unicode_utils
+
+# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
+# sections 3.11 and 4.4.
+#
+# process_range() splits the offset of a syllable from U+AC00 into three
+# indexes and concatenates one short name from each table.
+
+# Short names of the 19 initial consonants (index1 in process_range).
+JAMO_INITIAL_SHORT_NAME = (
+ 'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
+ 'C', 'K', 'T', 'P', 'H'
+)
+
+# Short names of the 21 medial vowels (index2 in process_range).
+JAMO_MEDIAL_SHORT_NAME = (
+ 'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
+ 'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
+)
+
+# Short names of the 28 final consonants (index3 in process_range); the
+# first entry is empty for syllables without a final consonant.
+# NOTE(review): Jamo.txt of the Unicode Character Database appears to
+# spell the sixth entry 'NJ', not 'NI' - confirm against upstream.
+JAMO_FINAL_SHORT_NAME = (
+ '', 'G', 'GG', 'GS', 'N', 'NI', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
+ 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
+ 'P', 'H'
+)
+
+def process_range(start, end, outfile, name):
+    '''Writes a range of code points into the CHARMAP section of the
+    output file
+
+    start and end are the first and the last code point of the range
+    as hexadecimal strings, outfile is the file to write to and name
+    is the name of the range, which is written at the end of each
+    line.
+
+    '''
+    first = int(start, 16)
+    last = int(end, 16)
+    if 'Hangul Syllable' in name:
+        # from glibc/localedata/ChangeLog:
+        #
+        # 2000-09-24 Bruno Haible
+        # * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use>
+        # ranges, so they become printable and carry a width. Comment
+        # out surrogate ranges. Add a WIDTH table
+        #
+        # So we expand the Hangul Syllables here:
+        for i in range(first, last + 1):
+            index2, index3 = divmod(i - 0xAC00, 28)
+            index1, index2 = divmod(index2, 21)
+            hangul_syllable_name = 'HANGUL SYLLABLE ' \
+                + JAMO_INITIAL_SHORT_NAME[index1] \
+                + JAMO_MEDIAL_SHORT_NAME[index2] \
+                + JAMO_FINAL_SHORT_NAME[index3]
+            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
+                unicode_utils.ucs_symbol(i), convert_to_hex(i),
+                hangul_syllable_name))
+        return
+    # UnicodeData.txt file contains code point ranges like this:
+    #
+    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
+    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
+    #
+    # The glibc UTF-8 file splits ranges like these into shorter
+    # ranges of 64 code points each:
+    #
+    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
+    # …
+    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
+    #
+    # The loop has to include “last” itself, otherwise the final code
+    # point is lost whenever it is the first one of a new chunk.
+    for i in range(first, last + 1, 64):
+        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
+            unicode_utils.ucs_symbol(i),
+            # The final chunk may be shorter than 64 code points.
+            unicode_utils.ucs_symbol(min(i + 63, last)),
+            convert_to_hex(i),
+            name))
+
+def process_charmap(flines, outfile):
+    '''This function takes an array which contains *all* lines of
+    UnicodeData.txt and writes lines to outfile as used in the
+
+    CHARMAP
+    …
+    END CHARMAP
+
+    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.
+
+    Samples for input lines:
+
+    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
+    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
+    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
+    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
+    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
+    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
+    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;
+
+    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):
+
+    <U0010> /x10 DATA LINK ESCAPE
+    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
+    %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First>
+    %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last>
+    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>
+
+    '''
+    fields_start = []
+    for line in flines:
+        fields = line.split(";")
+        # Some characters have “<control>” as their name. We try to
+        # use the “Unicode 1.0 Name” (10th field in
+        # UnicodeData.txt) for them.
+        #
+        # The Characters U+0080, U+0081, U+0084 and U+0099 have
+        # “<control>” as their name but do not even have a
+        # “Unicode 1.0 Name”. We could write code to take their
+        # alternate names from NameAliases.txt.
+        if fields[1] == "<control>" and fields[10]:
+            fields[1] = fields[10]
+        # Handling code point ranges like:
+        #
+        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
+        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
+        is_surrogate = 'Surrogate,' in fields[1]
+        if fields[1].endswith(', First>') and not is_surrogate:
+            fields_start = fields
+            continue
+        if fields[1].endswith(', Last>') and not is_surrogate:
+            # '<Name, Last>' -> '<Name>'
+            process_range(fields_start[0], fields[0],
+                          outfile, fields[1][:-7]+'>')
+            fields_start = []
+            continue
+        fields_start = []
+        if is_surrogate:
+            # Comment out the surrogates in the UTF-8 file.
+            # One could of course skip them completely but
+            # the original UTF-8 file in glibc had them as
+            # comments, so we keep these comment lines.
+            outfile.write('%')
+        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
+            unicode_utils.ucs_symbol(int(fields[0], 16)),
+            convert_to_hex(int(fields[0], 16)),
+            fields[1]))
+
+def convert_to_hex(code_point):
+    '''Converts a code point to a hexadecimal UTF-8 representation
+    like /x**/x**/x**.
+
+    Surrogate code points (U+D800..U+DFFF) are converted as well,
+    using the same three byte scheme as the other code points of
+    that size.'''
+    # In Python3, a plain .encode('UTF-8') does not work for
+    # surrogates. The “surrogatepass” error handler encodes them
+    # anyway, so no table of hand-converted surrogates is needed.
+    encoded = chr(code_point).encode('UTF-8', errors='surrogatepass')
+    return ''.join('/x{:02x}'.format(byte) for byte in encoded)
+
+def write_header_charmap(outfile):
+    '''Write the header on top of the CHARMAP section to the output file
+
+    The header consists of the charmap declarations, each introduced
+    by its keyword in angle brackets, two comment lines and the line
+    opening the CHARMAP section.'''
+    outfile.write("<code_set_name> UTF-8\n")
+    outfile.write("<comment_char> %\n")
+    outfile.write("<escape_char> /\n")
+    outfile.write("<mb_cur_min> 1\n")
+    outfile.write("<mb_cur_max> 6\n\n")
+    outfile.write("% CHARMAP generated using utf8_gen.py\n")
+    outfile.write("% alias ISO-10646/UTF-8\n")
+    outfile.write("CHARMAP\n")
+
+def write_header_width(outfile, unicode_version):
+    '''Writes the header on top of the WIDTH section to the output file
+
+    unicode_version is the version string mentioned in the first
+    comment line. The header explains where the widths come from and
+    ends with the line opening the WIDTH section.'''
+    header_lines = (
+        '% Character width according to Unicode '
+        + '{:s}.\n'.format(unicode_version),
+        '% - Default width is 1.\n',
+        '% - Double-width characters have width 2; generated from\n',
+        '% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n',
+        '% - Non-spacing characters have width 0; '
+        + 'generated from PropList.txt or\n',
+        '% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
+        + 'UnicodeData.txt"\n',
+        '% - Format control characters have width 0; '
+        + 'generated from\n',
+        "% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n",
+        # The zero width characters (“ZERO WIDTH …” in UnicodeData.txt)
+        # get no explanation of their own, they are covered by Cf.
+        "WIDTH\n",
+    )
+    for header_line in header_lines:
+        outfile.write(header_line)
+
+def process_width(outfile, ulines, elines, plines):
+    '''Writes the body of the WIDTH section to outfile.
+
+    ulines are lines from UnicodeData.txt, elines are lines from
+    EastAsianWidth.txt containing characters with width “W” or “F”,
+    plines are lines from PropList.txt which contain characters
+    with the property “Prepended_Concatenation_Mark”.
+
+    Only code points whose width differs from the default width 1 are
+    written. Adjacent code points with the same width are written as
+    one range.
+
+    '''
+    def code_points_of(line):
+        '''Returns the code points named in the first field of line,
+        which is either “XXXX” or “XXXX..YYYY”.'''
+        first, _, last = line.split(";")[0].partition("..")
+        return range(int(first, 16), int(last or first, 16)+1)
+
+    width_dict = {}
+    for line in elines:
+        for key in code_points_of(line):
+            width_dict[key] = 2
+
+    for line in ulines:
+        fields = line.split(";")
+        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
+            width_dict[int(fields[0], 16)] = 0
+
+    for line in plines:
+        # Characters with the property “Prepended_Concatenation_Mark”
+        # should have the width 1:
+        for key in code_points_of(line):
+            # pop() instead of del: such a character need not have got
+            # an entry from the loops above.
+            width_dict.pop(key, None) # default width is 1
+
+    # handle special cases for compatibility
+    # https://www.cs.tut.fi/~jkorpela/shy.html
+    width_dict.pop(0x00AD, None) # default width is 1
+    for key in range(0x1160, 0x1200):
+        width_dict[key] = 0
+    for key in range(0x3248, 0x3250):
+        # These are “A” which means we can decide whether to treat them
+        # as “W” or “N” based on context:
+        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
+        # For us, “W” seems better.
+        width_dict[key] = 2
+    for key in range(0x4DC0, 0x4E00):
+        width_dict[key] = 2
+
+    # Collect [first, last] pairs of adjacent code points sharing a width.
+    runs = []
+    for key in sorted(width_dict):
+        if (runs
+                and key == runs[-1][1] + 1
+                and width_dict[key] == width_dict[runs[-1][0]]):
+            runs[-1][1] = key
+        else:
+            runs.append([key, key])
+
+    for first, last in runs:
+        if first == last:
+            outfile.write('{:s}\t{:d}\n'.format(
+                unicode_utils.ucs_symbol(first),
+                width_dict[first]))
+        else:
+            outfile.write('{:s}...{:s}\t{:d}\n'.format(
+                unicode_utils.ucs_symbol(first),
+                unicode_utils.ucs_symbol(last),
+                width_dict[first]))
+
+if __name__ == "__main__":
+    # Reads UnicodeData.txt, EastAsianWidth.txt and PropList.txt and
+    # writes the CHARMAP and the WIDTH section to a file named “UTF-8”
+    # in the current directory.
+    PARSER = argparse.ArgumentParser(
+        description='''
+        Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt, and PropList.txt.
+        ''')
+    PARSER.add_argument(
+        '-u', '--unicode_data_file',
+        nargs='?',
+        type=str,
+        default='UnicodeData.txt',
+        help=('The UnicodeData.txt file to read, '
+              + 'default: %(default)s'))
+    # The spelling “with” (not “width”) is part of the command line
+    # interface and is therefore kept.
+    PARSER.add_argument(
+        '-e', '--east_asian_with_file',
+        nargs='?',
+        type=str,
+        default='EastAsianWidth.txt',
+        help=('The EastAsianWidth.txt file to read, '
+              + 'default: %(default)s'))
+    PARSER.add_argument(
+        '-p', '--prop_list_file',
+        nargs='?',
+        type=str,
+        default='PropList.txt',
+        help=('The PropList.txt file to read, '
+              + 'default: %(default)s'))
+    PARSER.add_argument(
+        '--unicode_version',
+        nargs='?',
+        required=True,
+        type=str,
+        help='The Unicode version of the input files used.')
+    ARGS = PARSER.parse_args()
+
+    with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
+        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
+    with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
+        EAST_ASIAN_WIDTH_LINES = []
+        for LINE in EAST_ASIAN_WIDTH_FILE:
+            # If characters from EastAsianWidth.txt which are from
+            # reserved ranges (i.e. not yet assigned code points)
+            # are added to the WIDTH section of the UTF-8 file, then
+            # “make check” produces “Unknown Character” errors for
+            # these code points because such unassigned code points
+            # are not in the CHARMAP section of the UTF-8 file.
+            #
+            # Therefore, we skip all reserved code points when reading
+            # the EastAsianWidth.txt file. Only ranges described as
+            # “<reserved-XXXX>..<reserved-YYYY>” are skipped; matching
+            # every line with “..” would drop the assigned ranges, too.
+            if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
+                continue
+            if re.match(r'^[^;]*;[WF]', LINE):
+                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
+    with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
+        PROP_LIST_LINES = []
+        for LINE in PROP_LIST_FILE:
+            if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
+                PROP_LIST_LINES.append(LINE.strip())
+    with open('UTF-8', mode='w') as OUTFILE:
+        # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
+        write_header_charmap(OUTFILE)
+        process_charmap(UNICODE_DATA_LINES, OUTFILE)
+        OUTFILE.write("END CHARMAP\n\n")
+        # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
+        write_header_width(OUTFILE, ARGS.unicode_version)
+        process_width(OUTFILE,
+                      UNICODE_DATA_LINES,
+                      EAST_ASIAN_WIDTH_LINES,
+                      PROP_LIST_LINES)
+        OUTFILE.write("END WIDTH\n")
diff --git a/contrib/unicode/gen_wcwidth.py b/contrib/unicode/gen_wcwidth.py
new file mode 100755
index 00000000000..02b28bcedcf
--- /dev/null
+++ b/contrib/unicode/gen_wcwidth.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+#
+# Script to generate tables for cpp_wcwidth, leveraging glibc's utf8_gen.py.
+#
+# This file is part of GCC.
+#
+# GCC is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 3, or (at your option) any later
+# version.
+#
+# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with GCC; see the file COPYING3. If not see
+# <http://www.gnu.org/licenses/>. */
+
+import sys
+import os
+
+if len(sys.argv) != 2:  # Exactly one argument is required: the Unicode version.
+    print("usage: %s <unicode version>" % sys.argv[0], file=sys.stderr)
+    sys.exit(1)
+unicode_version = sys.argv[1]
+
+# Parse a "<Uxxxx>" codepoint as output by glibc tools (else ValueError).
+def parse_ucn(s):
+    if not (s.startswith("<U") and s.endswith(">")):
+        raise ValueError
+    return int(s[2:-1], base=16)
+
+# Process a line of width output from utf8_gen.py and update global array.
+widths = [1] * (1 + 0x10FFFF)
+def process_width(line):
+    # Example lines:
+    # <U0000> 0
+    # <U0300>...<U036F> 0
+
+    s = line.split()
+    width = int(s[1])
+    r = s[0].split("...")
+    if len(r) == 1:
+        begin = parse_ucn(r[0])
+        end = begin + 1
+    elif len(r) == 2:
+        begin = parse_ucn(r[0])
+        end = parse_ucn(r[1]) + 1
+    else:
+        raise ValueError
+    widths[begin:end] = [width] * (end - begin)
+
+# To keep things simple, we use glibc utf8_gen.py as-is. It only outputs to a
+# file named UTF-8, which is not configurable. Then we parse this into the form
+# we want it.
+os.system("from_glibc/utf8_gen.py --unicode_version %s" % unicode_version)
+processing = False
+for line in open("UTF-8", "r"):
+    if processing:
+        if line == "END WIDTH\n":
+            processing = False
+        else:
+            try:
+                process_width(line)
+            except (ValueError, IndexError) as e:
+                print(e, "warning: ignored unexpected line: %s" % line,
+                      file=sys.stderr, end="")
+    elif line == "WIDTH\n":
+        processing = True
+
+# All bytes < 256 we treat as width 1.
+widths[0:256] = [1] * 256
+
+# Condense to contiguous [end, width] ranges; the trailing run is not emitted.
+cur_range = [-1, 1]
+all_ranges = []
+for i, width in enumerate(widths):
+    if width == cur_range[1]:
+        cur_range[0] = i
+    else:
+        all_ranges.append(cur_range)
+        cur_range = [i, width]
+
+# Output the arrays for generated_cpp_wcwidth.h
+print("/* Generated by contrib/unicode/gen_wcwidth.py,",
+      "with the help of glibc's")
+print(" utf8_gen.py, using version %s" % unicode_version,
+      "of the Unicode standard. */")
+print("\nstatic const cppchar_t wcwidth_range_ends[] = {", end="")
+for i, r in enumerate(all_ranges):
+    if i % 8:
+        print(" ", end="")
+    else:
+        print("\n ", end="")
+    print("0x%x," % (r[0]), end="")
+print("\n};\n")
+print("static const unsigned char wcwidth_widths[] = {", end="")
+for i, r in enumerate(all_ranges):
+    if i % 24:
+        print(" ", end="")
+    else:
+        print("\n ", end="")
+    print("%d," % r[1], end="")
+print("\n};")
diff --git a/gcc/diagnostic-show-locus.c b/gcc/diagnostic-show-locus.c
index 4d563dda8f4..7a5bd36d962 100644
--- a/gcc/diagnostic-show-locus.c
+++ b/gcc/diagnostic-show-locus.c
@@ -30,6 +30,7 @@ along with GCC; see the file COPYING3. If not see
#include "gcc-rich-location.h"
#include "selftest.h"
#include "selftest-diagnostic.h"
+#include "cpplib.h"
#ifdef HAVE_TERMIOS_H
# include
@@ -112,7 +113,29 @@ class colorizer
const char *m_stop_color;
};
-/* A point within a layout_range; similar to an expanded_location,
+/* In order to handle multibyte sources properly, all of this logic needs to be
+ aware of the distinction between the number of bytes and the number of
+ display columns occupied by a character. One or the other is more useful
+ depending on the context. For instance, in order to output the caret at the
+ correct location, we need to count display columns; in order to colorize a
+ source line, we need to count the bytes. All locations are provided to us
+ as byte counts. We augment these with the display column so that it can be
+ used when needed. This is not the most efficient way to do things since it
+ requires looping over the whole line each time, but it should be fine for
+ the purpose of outputting diagnostics. */
+
+class exploc_with_display_col : public expanded_location
+{
+ public:
+ exploc_with_display_col (const expanded_location &exploc)
+ : expanded_location (exploc),
+ m_display_col (location_compute_display_column (exploc)) {}
+
+ int m_display_col;
+};
+
+
+/* A point within a layout_range; similar to an exploc_with_display_col,
but after filtering on file. */
class layout_point
@@ -120,10 +143,17 @@ class layout_point
public:
layout_point (const expanded_location &exploc)
: m_line (exploc.line),
- m_column (exploc.column) {}
+ m_column (exploc.column),
+ m_display_col (location_compute_display_column (exploc)) {}
+
+ int get_col (bool use_display) const
+ {
+ return use_display ? m_display_col : m_column;
+ }
linenum_type m_line;
int m_column;
+ int m_display_col;
};
/* A class for use by "class layout" below: a filtered location_range. */
@@ -138,7 +168,7 @@ class layout_range
unsigned original_idx,
const range_label *label);
- bool contains_point (linenum_type row, int column) const;
+ bool contains_point (linenum_type row, int column, bool use_display) const;
bool intersects_line_p (linenum_type row) const;
layout_point m_start;
@@ -157,6 +187,17 @@ struct line_bounds
{
int m_first_non_ws;
int m_last_non_ws;
+
+ void convert_to_display_cols (char_span line)
+ {
+ m_first_non_ws = cpp_byte_column_to_display_column (line.get_buffer (),
+ line.length (),
+ m_first_non_ws);
+
+ m_last_non_ws = cpp_byte_column_to_display_column (line.get_buffer (),
+ line.length (),
+ m_last_non_ws);
+ }
};
/* A range of contiguous source lines within a layout (e.g. "lines 5-10"
@@ -284,6 +325,7 @@ class layout
get_state_at_point (/* Inputs. */
linenum_type row, int column,
int first_non_ws, int last_non_ws,
+ bool use_display,
/* Outputs. */
point_state *out_state);
@@ -298,7 +340,7 @@ class layout
diagnostic_context *m_context;
pretty_printer *m_pp;
location_t m_primary_loc;
- expanded_location m_exploc;
+ exploc_with_display_col m_exploc;
colorizer m_colorizer;
bool m_colorize_source_p;
bool m_show_labels_p;
@@ -472,10 +514,15 @@ layout_range::layout_range (const expanded_location *start_exploc,
- 'w' indicates a point within the range
- 'F' indicates the finish of the range (which is
within it).
- - 'a' indicates a subsequent point *after* the range. */
+ - 'a' indicates a subsequent point *after* the range.
+
+ USE_DISPLAY controls whether we check the byte column or
+ the display column; one or the other is more convenient
+ depending on the context. */
bool
-layout_range::contains_point (linenum_type row, int column) const
+layout_range::contains_point (linenum_type row, int column,
+ bool use_display) const
{
gcc_assert (m_start.m_line <= m_finish.m_line);
/* ...but the equivalent isn't true for the columns;
@@ -491,7 +538,7 @@ layout_range::contains_point (linenum_type row, int column) const
/* On same line as start of range (corresponding
to line 02 in example A and line 03 in example B). */
{
- if (column < m_start.m_column)
+ if (column < m_start.get_col (use_display))
/* Points on the starting line of the range, but
before the column in which it begins. */
return false;
@@ -505,7 +552,7 @@ layout_range::contains_point (linenum_type row, int column) const
{
/* This is a single-line range. */
gcc_assert (row == m_finish.m_line);
- return column <= m_finish.m_column;
+ return column <= m_finish.get_col (use_display);
}
}
@@ -530,7 +577,7 @@ layout_range::contains_point (linenum_type row, int column) const
gcc_assert (row == m_finish.m_line);
- return column <= m_finish.m_column;
+ return column <= m_finish.get_col (use_display);
}
/* Does this layout_range contain any part of line ROW? */
@@ -574,20 +621,23 @@ test_layout_range_for_single_point ()
/* Tests for layout_range::contains_point. */
- /* Before the line. */
- ASSERT_FALSE (point.contains_point (6, 1));
+ for (int use_display = 0; use_display <= 1; ++use_display)
+ {
+ /* Before the line. */
+ ASSERT_FALSE (point.contains_point (6, 1, use_display));
- /* On the line, but before start. */
- ASSERT_FALSE (point.contains_point (7, 9));
+ /* On the line, but before start. */
+ ASSERT_FALSE (point.contains_point (7, 9, use_display));
- /* At the point. */
- ASSERT_TRUE (point.contains_point (7, 10));
+ /* At the point. */
+ ASSERT_TRUE (point.contains_point (7, 10, use_display));
- /* On the line, after the point. */
- ASSERT_FALSE (point.contains_point (7, 11));
+ /* On the line, after the point. */
+ ASSERT_FALSE (point.contains_point (7, 11, use_display));
- /* After the line. */
- ASSERT_FALSE (point.contains_point (8, 1));
+ /* After the line. */
+ ASSERT_FALSE (point.contains_point (8, 1, use_display));
+ }
/* Tests for layout_range::intersects_line_p. */
ASSERT_FALSE (point.intersects_line_p (6));
@@ -605,26 +655,29 @@ test_layout_range_for_single_line ()
/* Tests for layout_range::contains_point. */
- /* Before the line. */
- ASSERT_FALSE (example_a.contains_point (1, 1));
+ for (int use_display = 0; use_display <= 1; ++use_display)
+ {
+ /* Before the line. */
+ ASSERT_FALSE (example_a.contains_point (1, 1, use_display));
- /* On the line, but before start. */
- ASSERT_FALSE (example_a.contains_point (2, 21));
+ /* On the line, but before start. */
+ ASSERT_FALSE (example_a.contains_point (2, 21, use_display));
- /* On the line, at the start. */
- ASSERT_TRUE (example_a.contains_point (2, 22));
+ /* On the line, at the start. */
+ ASSERT_TRUE (example_a.contains_point (2, 22, use_display));
- /* On the line, within the range. */
- ASSERT_TRUE (example_a.contains_point (2, 23));
+ /* On the line, within the range. */
+ ASSERT_TRUE (example_a.contains_point (2, 23, use_display));
- /* On the line, at the end. */
- ASSERT_TRUE (example_a.contains_point (2, 38));
+ /* On the line, at the end. */
+ ASSERT_TRUE (example_a.contains_point (2, 38, use_display));
- /* On the line, after the end. */
- ASSERT_FALSE (example_a.contains_point (2, 39));
+ /* On the line, after the end. */
+ ASSERT_FALSE (example_a.contains_point (2, 39, use_display));
- /* After the line. */
- ASSERT_FALSE (example_a.contains_point (2, 39));
+ /* After the line. */
+ ASSERT_FALSE (example_a.contains_point (3, 1, use_display));
+ }
/* Tests for layout_range::intersects_line_p. */
ASSERT_FALSE (example_a.intersects_line_p (1));
@@ -642,40 +695,43 @@ test_layout_range_for_multiple_lines ()
/* Tests for layout_range::contains_point. */
- /* Before first line. */
- ASSERT_FALSE (example_b.contains_point (1, 1));
+ for (int use_display = 0; use_display <= 1; ++use_display)
+ {
+ /* Before first line. */
+ ASSERT_FALSE (example_b.contains_point (1, 1, use_display));
- /* On the first line, but before start. */
- ASSERT_FALSE (example_b.contains_point (3, 13));
+ /* On the first line, but before start. */
+ ASSERT_FALSE (example_b.contains_point (3, 13, use_display));
- /* At the start. */
- ASSERT_TRUE (example_b.contains_point (3, 14));
+ /* At the start. */
+ ASSERT_TRUE (example_b.contains_point (3, 14, use_display));
- /* On the first line, within the range. */
- ASSERT_TRUE (example_b.contains_point (3, 15));
+ /* On the first line, within the range. */
+ ASSERT_TRUE (example_b.contains_point (3, 15, use_display));
- /* On an interior line.
- The column number should not matter; try various boundary
- values. */
- ASSERT_TRUE (example_b.contains_point (4, 1));
- ASSERT_TRUE (example_b.contains_point (4, 7));
- ASSERT_TRUE (example_b.contains_point (4, 8));
- ASSERT_TRUE (example_b.contains_point (4, 9));
- ASSERT_TRUE (example_b.contains_point (4, 13));
- ASSERT_TRUE (example_b.contains_point (4, 14));
- ASSERT_TRUE (example_b.contains_point (4, 15));
+ /* On an interior line.
+ The column number should not matter; try various boundary
+ values. */
+ ASSERT_TRUE (example_b.contains_point (4, 1, use_display));
+ ASSERT_TRUE (example_b.contains_point (4, 7, use_display));
+ ASSERT_TRUE (example_b.contains_point (4, 8, use_display));
+ ASSERT_TRUE (example_b.contains_point (4, 9, use_display));
+ ASSERT_TRUE (example_b.contains_point (4, 13, use_display));
+ ASSERT_TRUE (example_b.contains_point (4, 14, use_display));
+ ASSERT_TRUE (example_b.contains_point (4, 15, use_display));
- /* On the final line, before the end. */
- ASSERT_TRUE (example_b.contains_point (5, 7));
+ /* On the final line, before the end. */
+ ASSERT_TRUE (example_b.contains_point (5, 7, use_display));
- /* On the final line, at the end. */
- ASSERT_TRUE (example_b.contains_point (5, 8));
+ /* On the final line, at the end. */
+ ASSERT_TRUE (example_b.contains_point (5, 8, use_display));
- /* On the final line, after the end. */
- ASSERT_FALSE (example_b.contains_point (5, 9));
+ /* On the final line, after the end. */
+ ASSERT_FALSE (example_b.contains_point (5, 9, use_display));
- /* After the line. */
- ASSERT_FALSE (example_b.contains_point (6, 1));
+ /* After the line. */
+ ASSERT_FALSE (example_b.contains_point (6, 1, use_display));
+ }
/* Tests for layout_range::intersects_line_p. */
ASSERT_FALSE (example_b.intersects_line_p (2));
@@ -687,8 +743,8 @@ test_layout_range_for_multiple_lines ()
#endif /* #if CHECKING_P */
-/* Given a source line LINE of length LINE_WIDTH, determine the width
- without any trailing whitespace. */
+/* Given a source line LINE of length LINE_WIDTH bytes, determine the width
+ (in bytes, not display cols) without any trailing whitespace. */
static int
get_line_width_without_trailing_whitespace (const char *line, int line_width)
@@ -897,17 +953,35 @@ layout::layout (diagnostic_context * context,
will be adjusted accordingly. */
size_t max_width = m_context->caret_max_width;
char_span line = location_get_source_line (m_exploc.file, m_exploc.line);
- if (line && (size_t)m_exploc.column <= line.length ())
+ if (line && max_width)
{
- size_t right_margin = CARET_LINE_MARGIN;
- size_t column = m_exploc.column;
- if (m_show_line_numbers_p)
- column += m_linenum_width + 2;
- right_margin = MIN (line.length () - column, right_margin);
- right_margin = max_width - right_margin;
- if (line.length () >= max_width && column > right_margin)
- m_x_offset = column - right_margin;
- gcc_assert (m_x_offset >= 0);
+ size_t column = m_exploc.m_display_col;
+ int line_width
+ = get_line_width_without_trailing_whitespace (line.get_buffer (),
+ line.length ());
+ size_t eol = cpp_display_width (line.get_buffer (), line_width);
+ const size_t eol_before_linenum = eol;
+
+ if (column <= eol)
+ {
+ if (m_show_line_numbers_p)
+ {
+ column += m_linenum_width + 2;
+ eol += m_linenum_width + 2;
+ }
+ size_t right_margin = CARET_LINE_MARGIN;
+ right_margin = MIN (eol - column, right_margin);
+ right_margin = max_width - right_margin;
+ /* Note: if right_margin > max_width, we end up failing this next
+ check due to wrapping, and we don't offset anything. Otherwise we
+ would conclude we can't output the line at all. */
+ if (eol >= max_width && column > right_margin)
+ {
+ m_x_offset = column - right_margin;
+ m_x_offset = MIN (m_x_offset, (int) eol_before_linenum - 1);
+ }
+ gcc_assert (m_x_offset >= 0);
+ }
}
if (context->show_ruler_p)
@@ -1252,7 +1326,9 @@ layout::calculate_line_spans ()
/* Print line ROW of source code, potentially colorized at any ranges, and
populate *LBOUNDS_OUT.
LINE is the source line (not necessarily 0-terminated) and LINE_WIDTH
- is its width. */
+ is its width. This function deals only with byte offsets, not display
+ columns; m_x_offset must be converted from display to byte units. In
+ particular, LINE_WIDTH and LBOUNDS_OUT are in bytes. */
void
layout::print_source_line (linenum_type row, const char *line, int line_width,
@@ -1264,7 +1340,10 @@ layout::print_source_line (linenum_type row, const char *line, int line_width,
whitespace. */
line_width = get_line_width_without_trailing_whitespace (line,
line_width);
- line += m_x_offset;
+
+ const int x_offset_bytes
+ = cpp_display_column_to_byte_column (line, line_width, m_x_offset);
+ line += x_offset_bytes;
if (m_show_line_numbers_p)
{
@@ -1278,7 +1357,7 @@ layout::print_source_line (linenum_type row, const char *line, int line_width,
int first_non_ws = INT_MAX;
int last_non_ws = 0;
int column;
- for (column = 1 + m_x_offset; column <= line_width; column++)
+ for (column = 1 + x_offset_bytes; column <= line_width; column++)
{
/* Assuming colorization is enabled for the caret and underline
characters, we may also colorize the associated characters
@@ -1298,6 +1377,8 @@ layout::print_source_line (linenum_type row, const char *line, int line_width,
point_state state;
in_range_p = get_state_at_point (row, column,
0, INT_MAX,
+ false, /* Using bytes, not display
+ columns, here. */
&state);
if (in_range_p)
m_colorizer.set_range (state.range_idx);
@@ -1360,12 +1441,13 @@ layout::start_annotation_line (char margin_char) const
}
/* Print a line consisting of the caret/underlines for the given
- source line. */
+ source line. This function works with display columns, rather than byte
+ counts; in particular, LBOUNDS should be in display column units. */
void
layout::print_annotation_line (linenum_type row, const line_bounds lbounds)
{
- int x_bound = get_x_bound_for_row (row, m_exploc.column,
+ int x_bound = get_x_bound_for_row (row, m_exploc.m_display_col,
lbounds.m_last_non_ws);
start_annotation_line ();
@@ -1378,6 +1460,7 @@ layout::print_annotation_line (linenum_type row, const line_bounds lbounds)
in_range_p = get_state_at_point (row, column,
lbounds.m_first_non_ws,
lbounds.m_last_non_ws,
+ true, /* Using display units. */
&state);
if (in_range_p)
{
@@ -1415,9 +1498,11 @@ class line_label
public:
line_label (int state_idx, int column, label_text text)
: m_state_idx (state_idx), m_column (column),
- m_text (text), m_length (strlen (text.m_buffer)),
- m_label_line (0)
- {}
+ m_text (text), m_label_line (0)
+ {
+ const int bytes = strlen (text.m_buffer);
+ m_length = cpp_display_width (text.m_buffer, bytes);
+ }
/* Sorting is primarily by column, then by state index. */
static int comparator (const void *p1, const void *p2)
@@ -1459,7 +1544,7 @@ layout::print_any_labels (linenum_type row)
/* Reject labels that aren't fully visible due to clipping
by m_x_offset. */
- if (range->m_caret.m_column <= m_x_offset)
+ if (range->m_caret.m_display_col <= m_x_offset)
continue;
label_text text;
@@ -1471,7 +1556,7 @@ layout::print_any_labels (linenum_type row)
if (text.m_buffer == NULL)
continue;
- labels.safe_push (line_label (i, range->m_caret.m_column, text));
+ labels.safe_push (line_label (i, range->m_caret.m_display_col, text));
}
}
@@ -1624,7 +1709,7 @@ layout::print_leading_fixits (linenum_type row)
/* Subroutine of layout::print_trailing_fixits.
Determine if the annotation line printed for LINE contained
- the exact range from START_COLUMN to FINISH_COLUMN. */
+ the exact range from START_COLUMN to FINISH_COLUMN (in display units). */
bool
layout::annotation_line_showed_range_p (linenum_type line, int start_column,
@@ -1634,9 +1719,9 @@ layout::annotation_line_showed_range_p (linenum_type line, int start_column,
int i;
FOR_EACH_VEC_ELT (m_layout_ranges, i, range)
if (range->m_start.m_line == line
- && range->m_start.m_column == start_column
+ && range->m_start.m_display_col == start_column
&& range->m_finish.m_line == line
- && range->m_finish.m_column == finish_column)
+ && range->m_finish.m_display_col == finish_column)
return true;
return false;
}
@@ -1723,7 +1808,7 @@ layout::annotation_line_showed_range_p (linenum_type line, int start_column,
and is thus printed as desired. */
-/* A range of columns within a line. */
+/* A range of (byte or display) columns within a line. */
class column_range
{
@@ -1743,32 +1828,51 @@ public:
int finish;
};
-/* Get the range of columns that HINT would affect. */
-
+/* Get the range of bytes or display columns that HINT would affect. */
static column_range
-get_affected_columns (const fixit_hint *hint)
+get_affected_range (const fixit_hint *hint, bool use_display)
{
- int start_column = LOCATION_COLUMN (hint->get_start_loc ());
- int finish_column = LOCATION_COLUMN (hint->get_next_loc ()) - 1;
+ expanded_location exploc_start = expand_location (hint->get_start_loc ());
+ expanded_location exploc_finish = expand_location (hint->get_next_loc ());
+ --exploc_finish.column;
+ int start_column;
+ int finish_column;
+ if (use_display)
+ {
+ start_column = location_compute_display_column (exploc_start);
+ if (hint->insertion_p ())
+ finish_column = start_column - 1;
+ else
+ finish_column = location_compute_display_column (exploc_finish);
+ }
+ else
+ {
+ start_column = exploc_start.column;
+ finish_column = exploc_finish.column;
+ }
return column_range (start_column, finish_column);
}
-/* Get the range of columns that would be printed for HINT. */
+/* Get the range of display columns that would be printed for HINT. */
static column_range
get_printed_columns (const fixit_hint *hint)
{
- int start_column = LOCATION_COLUMN (hint->get_start_loc ());
- int final_hint_column = start_column + hint->get_length () - 1;
+ expanded_location exploc = expand_location (hint->get_start_loc ());
+ int start_column = location_compute_display_column (exploc);
+ int hint_width = cpp_display_width (hint->get_string (),
+ hint->get_length ());
+ int final_hint_column = start_column + hint_width - 1;
if (hint->insertion_p ())
{
return column_range (start_column, final_hint_column);
}
else
{
- int finish_column = LOCATION_COLUMN (hint->get_next_loc ()) - 1;
-
+ exploc = expand_location (hint->get_next_loc ());
+ --exploc.column;
+ int finish_column = location_compute_display_column (exploc);
return column_range (start_column,
MAX (finish_column, final_hint_column));
}
@@ -1782,27 +1886,35 @@ get_printed_columns (const fixit_hint *hint)
class correction
{
public:
- correction (column_range affected_columns,
+ correction (column_range affected_bytes,
+ column_range affected_columns,
column_range printed_columns,
const char *new_text, size_t new_text_len)
- : m_affected_columns (affected_columns),
+ : m_affected_bytes (affected_bytes),
+ m_affected_columns (affected_columns),
m_printed_columns (printed_columns),
m_text (xstrdup (new_text)),
- m_len (new_text_len),
+ m_bytes (new_text_len),
m_alloc_sz (new_text_len + 1)
{
+ compute_display_cols ();
}
~correction () { free (m_text); }
bool insertion_p () const
{
- return m_affected_columns.start == m_affected_columns.finish + 1;
+ return m_affected_bytes.start == m_affected_bytes.finish + 1;
}
void ensure_capacity (size_t len);
void ensure_terminated ();
+ void compute_display_cols ()
+ {
+ m_display_cols = cpp_display_width (m_text, m_bytes);
+ }
+
void overwrite (int dst_offset, const char_span &src_span)
{
gcc_assert (dst_offset >= 0);
@@ -1815,6 +1927,7 @@ public:
is to be inserted, and finish is offset by the length of
the replacement.
If replace, then the range of columns affected. */
+ column_range m_affected_bytes;
column_range m_affected_columns;
/* If insert, then start: the column before which the text
@@ -1825,7 +1938,8 @@ public:
/* The text to be inserted/used as replacement. */
char *m_text;
- size_t m_len;
+ size_t m_bytes;
+ int m_display_cols;
size_t m_alloc_sz;
};
@@ -1850,8 +1964,8 @@ void
correction::ensure_terminated ()
{
/* 0-terminate the buffer. */
- gcc_assert (m_len < m_alloc_sz);
- m_text[m_len] = '\0';
+ gcc_assert (m_bytes < m_alloc_sz);
+ m_text[m_bytes] = '\0';
}
/* A list of corrections affecting a particular line.
@@ -1913,7 +2027,8 @@ source_line::source_line (const char *filename, int line)
void
line_corrections::add_hint (const fixit_hint *hint)
{
- column_range affected_columns = get_affected_columns (hint);
+ column_range affected_bytes = get_affected_range (hint, false);
+ column_range affected_columns = get_affected_range (hint, true);
column_range printed_columns = get_printed_columns (hint);
/* Potentially consolidate. */
@@ -1924,8 +2039,8 @@ line_corrections::add_hint (const fixit_hint *hint)
/* The following consolidation code assumes that the fix-it hints
have been sorted by start (done within layout's ctor). */
- gcc_assert (affected_columns.start
- >= last_correction->m_affected_columns.start);
+ gcc_assert (affected_bytes.start
+ >= last_correction->m_affected_bytes.start);
gcc_assert (printed_columns.start
>= last_correction->m_printed_columns.start);
@@ -1937,8 +2052,8 @@ line_corrections::add_hint (const fixit_hint *hint)
Attempt to inject a "replace" correction from immediately
after the end of the last hint to immediately before the start
of the next hint. */
- column_range between (last_correction->m_affected_columns.finish + 1,
- printed_columns.start - 1);
+ column_range between (last_correction->m_affected_bytes.finish + 1,
+ affected_bytes.start - 1);
/* Try to read the source. */
source_line line (m_filename, m_row);
@@ -1947,7 +2062,7 @@ line_corrections::add_hint (const fixit_hint *hint)
/* Consolidate into the last correction:
add a no-op "replace" of the "between" text, and
add the text from the new hint. */
- int old_len = last_correction->m_len;
+ int old_len = last_correction->m_bytes;
gcc_assert (old_len >= 0);
int between_len = between.finish + 1 - between.start;
gcc_assert (between_len >= 0);
@@ -1961,19 +2076,24 @@ line_corrections::add_hint (const fixit_hint *hint)
last_correction->overwrite (old_len + between_len,
char_span (hint->get_string (),
hint->get_length ()));
- last_correction->m_len = new_len;
+ last_correction->m_bytes = new_len;
last_correction->ensure_terminated ();
+ last_correction->m_affected_bytes.finish
+ = affected_bytes.finish;
last_correction->m_affected_columns.finish
= affected_columns.finish;
+ int prev_display_cols = last_correction->m_display_cols;
+ last_correction->compute_display_cols ();
last_correction->m_printed_columns.finish
- += between_len + hint->get_length ();
+ += last_correction->m_display_cols - prev_display_cols;
return;
}
}
}
/* If no consolidation happened, add a new correction instance. */
- m_corrections.safe_push (new correction (affected_columns,
+ m_corrections.safe_push (new correction (affected_bytes,
+ affected_columns,
printed_columns,
hint->get_string (),
hint->get_length ()));
@@ -2022,7 +2142,7 @@ layout::print_trailing_fixits (linenum_type row)
m_colorizer.set_fixit_insert ();
pp_string (m_pp, c->m_text);
m_colorizer.set_normal_text ();
- column += c->m_len;
+ column += c->m_display_cols;
}
else
{
@@ -2034,7 +2154,7 @@ layout::print_trailing_fixits (linenum_type row)
int finish_column = c->m_affected_columns.finish;
if (!annotation_line_showed_range_p (row, start_column,
finish_column)
- || c->m_len == 0)
+ || c->m_bytes == 0)
{
move_to_column (&column, start_column, true);
m_colorizer.set_fixit_delete ();
@@ -2045,13 +2165,13 @@ layout::print_trailing_fixits (linenum_type row)
/* Print the replacement text. REPLACE also covers
removals, so only do this extra work (potentially starting
a new line) if we have actual replacement text. */
- if (c->m_len > 0)
+ if (c->m_bytes > 0)
{
move_to_column (&column, start_column, true);
m_colorizer.set_fixit_insert ();
pp_string (m_pp, c->m_text);
m_colorizer.set_normal_text ();
- column += c->m_len;
+ column += c->m_display_cols;
}
}
}
@@ -2072,12 +2192,14 @@ layout::print_newline ()
/* Return true if (ROW/COLUMN) is within a range of the layout.
If it returns true, OUT_STATE is written to, with the
range index, and whether we should draw the caret at
- (ROW/COLUMN) (as opposed to an underline). */
+ (ROW/COLUMN) (as opposed to an underline). USE_DISPLAY controls
+ whether all inputs and outputs are in bytes or display column units. */
bool
layout::get_state_at_point (/* Inputs. */
linenum_type row, int column,
int first_non_ws, int last_non_ws,
+ bool use_display,
/* Outputs. */
point_state *out_state)
{
@@ -2090,7 +2212,7 @@ layout::get_state_at_point (/* Inputs. */
source colorization. */
continue;
- if (range->contains_point (row, column))
+ if (range->contains_point (row, column, use_display))
{
out_state->range_idx = i;
@@ -2098,7 +2220,7 @@ layout::get_state_at_point (/* Inputs. */
out_state->draw_caret_p = false;
if (range->m_range_display_kind == SHOW_RANGE_WITH_CARET
&& row == range->m_caret.m_line
- && column == range->m_caret.m_column)
+ && column == range->m_caret.get_col (use_display))
out_state->draw_caret_p = true;
/* Within a multiline range, don't display any underline
@@ -2118,11 +2240,11 @@ layout::get_state_at_point (/* Inputs. */
/* Helper function for use by layout::print_line when printing the
annotation line under the source line.
- Get the column beyond the rightmost one that could contain a caret or
- range marker, given that we stop rendering at trailing whitespace.
+ Get the display column beyond the rightmost one that could contain a caret
+ or range marker, given that we stop rendering at trailing whitespace.
ROW is the source line within the given file.
- CARET_COLUMN is the column of range 0's caret.
- LAST_NON_WS_COLUMN is the last column containing a non-whitespace
+ CARET_COLUMN is the display column of range 0's caret.
+ LAST_NON_WS_COLUMN is the last display column containing a non-whitespace
character of source (as determined when printing the source line). */
int
@@ -2141,8 +2263,8 @@ layout::get_x_bound_for_row (linenum_type row, int caret_column,
{
/* On the final line within a range; ensure that
we render up to the end of the range. */
- if (result <= range->m_finish.m_column)
- result = range->m_finish.m_column + 1;
+ if (result <= range->m_finish.m_display_col)
+ result = range->m_finish.m_display_col + 1;
}
else if (row < range->m_finish.m_line)
{
@@ -2233,7 +2355,11 @@ layout::print_line (linenum_type row)
print_leading_fixits (row);
print_source_line (row, line.get_buffer (), line.length (), &lbounds);
if (should_print_annotation_line_p (row))
- print_annotation_line (row, lbounds);
+ {
+ if (lbounds.m_first_non_ws != INT_MAX)
+ lbounds.convert_to_display_cols (line);
+ print_annotation_line (row, lbounds);
+ }
if (m_show_labels_p)
print_any_labels (row);
print_trailing_fixits (row);
@@ -2846,6 +2972,560 @@ test_diagnostic_show_locus_one_liner (const line_table_case &case_)
test_one_liner_labels ();
}
+/* Version of all one-liner tests exercising multibyte awareness. For
+ simplicity we stick to using two multibyte characters in the test, U+1F602
+ == "\xf0\x9f\x98\x82", which uses 4 bytes and 2 display columns, and U+03C0
+ == "\xcf\x80", which uses 2 bytes and 1 display column. Note: all of the
+ below asserts would be easier to read if we used UTF-8 directly in the
+ string constants, but it seems better not to demand the host compiler
+ support this, when it isn't otherwise necessary. Instead, whenever an
+ extended character appears in a string, we put a line break after it so that
+ all succeeding characters can appear visually at the correct display column.
+
+ All of these work on the following 1-line source file:
+
+ .0000000001111111111222222 display
+ .1234567890123456789012345 columns
+ "SS_foo = P_bar.SS_fieldP;\n"
+ .0000000111111111222222223 byte
+ .1356789012456789134567891 columns
+
+ which is set up by test_diagnostic_show_locus_one_liner_utf8, which then
+ calls them. Here SS represents the two display columns for the U+1F602
+ emoji and P represents the one display column for the U+03C0 pi symbol. */
+
+/* Just a caret.  */
+
+static void
+test_one_liner_simple_caret_utf8 ()
+{
+  test_diagnostic_context dc;
+  /* Byte column 18 is the "." in front of "SS_fieldP"; it is printed at
+     display column 15 because the preceding emoji and pi take fewer
+     display columns than bytes.  */
+  location_t caret = linemap_position_for_column (line_table, 18);
+  rich_location richloc (line_table, caret);
+  diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+  ASSERT_STREQ ("\n"
+                " \xf0\x9f\x98\x82"
+                   "_foo = \xcf\x80"
+                           "_bar.\xf0\x9f\x98\x82"
+                                  "_field\xcf\x80"
+                                         ";\n"
+                "               ^\n",
+                pp_formatted_text (dc.printer));
+}
+
+/* Caret and range.  */
+
+static void
+test_one_liner_caret_and_range_utf8 ()
+{
+  test_diagnostic_context dc;
+  /* Byte columns 12, 18 and 30 correspond to display columns 10, 15 and
+     24: the range covers "P_bar.SS_fieldP" with the caret on the ".".  */
+  location_t caret = linemap_position_for_column (line_table, 18);
+  location_t start = linemap_position_for_column (line_table, 12);
+  location_t finish = linemap_position_for_column (line_table, 30);
+  location_t loc = make_location (caret, start, finish);
+  rich_location richloc (line_table, loc);
+  diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+  ASSERT_STREQ ("\n"
+                " \xf0\x9f\x98\x82"
+                   "_foo = \xcf\x80"
+                           "_bar.\xf0\x9f\x98\x82"
+                                  "_field\xcf\x80"
+                                         ";\n"
+                "          ~~~~~^~~~~~~~~~\n",
+                pp_formatted_text (dc.printer));
+}
+
+/* Multiple ranges and carets.  */
+
+static void
+test_one_liner_multiple_carets_and_ranges_utf8 ()
+{
+  test_diagnostic_context dc;
+  /* "SS_foo": bytes 1-8, display columns 1-6, caret on display column 5.  */
+  location_t foo
+    = make_location (linemap_position_for_column (line_table, 7),
+                     linemap_position_for_column (line_table, 1),
+                     linemap_position_for_column (line_table, 8));
+  dc.caret_chars[0] = 'A';
+
+  /* "P_bar": bytes 12-17, display columns 10-14, caret on column 13.  */
+  location_t bar
+    = make_location (linemap_position_for_column (line_table, 16),
+                     linemap_position_for_column (line_table, 12),
+                     linemap_position_for_column (line_table, 17));
+  dc.caret_chars[1] = 'B';
+
+  /* "SS_fieldP": bytes 19-30, display columns 16-24, caret on column 21.  */
+  location_t field
+    = make_location (linemap_position_for_column (line_table, 26),
+                     linemap_position_for_column (line_table, 19),
+                     linemap_position_for_column (line_table, 30));
+  dc.caret_chars[2] = 'C';
+  rich_location richloc (line_table, foo);
+  richloc.add_range (bar, SHOW_RANGE_WITH_CARET);
+  richloc.add_range (field, SHOW_RANGE_WITH_CARET);
+  diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+  ASSERT_STREQ ("\n"
+                " \xf0\x9f\x98\x82"
+                   "_foo = \xcf\x80"
+                           "_bar.\xf0\x9f\x98\x82"
+                                  "_field\xcf\x80"
+                                         ";\n"
+                " ~~~~A~   ~~~B~ ~~~~~C~~~\n",
+                pp_formatted_text (dc.printer));
+}
+
+/* Insertion fix-it hint: adding an "&" to the front of "P_bar.SS_fieldP".  */
+
+static void
+test_one_liner_fixit_insert_before_utf8 ()
+{
+  test_diagnostic_context dc;
+  /* Byte column 12 is the pi symbol, at display column 10; both the caret
+     and the inserted "&" are printed there.  */
+  location_t caret = linemap_position_for_column (line_table, 12);
+  rich_location richloc (line_table, caret);
+  richloc.add_fixit_insert_before ("&");
+  diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+  ASSERT_STREQ ("\n"
+                " \xf0\x9f\x98\x82"
+                   "_foo = \xcf\x80"
+                           "_bar.\xf0\x9f\x98\x82"
+                                  "_field\xcf\x80"
+                                         ";\n"
+                "          ^\n"
+                "          &\n",
+                pp_formatted_text (dc.printer));
+}
+
+/* Insertion fix-it hint: adding a "[0]" after "SS_foo".  */
+
+static void
+test_one_liner_fixit_insert_after_utf8 ()
+{
+  test_diagnostic_context dc;
+  /* "SS_foo" is bytes 1-8 but only display columns 1-6, so the insertion
+     is printed at display column 7.  */
+  location_t start = linemap_position_for_column (line_table, 1);
+  location_t finish = linemap_position_for_column (line_table, 8);
+  location_t foo = make_location (start, start, finish);
+  rich_location richloc (line_table, foo);
+  richloc.add_fixit_insert_after ("[0]");
+  diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+  ASSERT_STREQ ("\n"
+                " \xf0\x9f\x98\x82"
+                   "_foo = \xcf\x80"
+                           "_bar.\xf0\x9f\x98\x82"
+                                  "_field\xcf\x80"
+                                         ";\n"
+                " ^~~~~~\n"
+                "       [0]\n",
+                pp_formatted_text (dc.printer));
+}
+
+/* Removal fix-it hint: removal of the ".SS_fieldP".  */
+
+static void
+test_one_liner_fixit_remove_utf8 ()
+{
+  test_diagnostic_context dc;
+  /* Bytes 18-30 are display columns 15-24: ten display columns get both
+     the underline and the "-" removal markers.  */
+  location_t start = linemap_position_for_column (line_table, 18);
+  location_t finish = linemap_position_for_column (line_table, 30);
+  location_t dot = make_location (start, start, finish);
+  rich_location richloc (line_table, dot);
+  richloc.add_fixit_remove ();
+  diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+  ASSERT_STREQ ("\n"
+                " \xf0\x9f\x98\x82"
+                   "_foo = \xcf\x80"
+                           "_bar.\xf0\x9f\x98\x82"
+                                  "_field\xcf\x80"
+                                         ";\n"
+                "               ^~~~~~~~~\n"
+                "               ----------\n",
+                pp_formatted_text (dc.printer));
+}
+
+/* Replace fix-it hint: replacing "SS_fieldP" with "m_SS_fieldP".  */
+
+static void
+test_one_liner_fixit_replace_utf8 ()
+{
+  test_diagnostic_context dc;
+  /* Bytes 19-30 are display columns 16-24; the replacement text is
+     printed starting at display column 16.  */
+  location_t start = linemap_position_for_column (line_table, 19);
+  location_t finish = linemap_position_for_column (line_table, 30);
+  location_t field = make_location (start, start, finish);
+  rich_location richloc (line_table, field);
+  richloc.add_fixit_replace ("m_\xf0\x9f\x98\x82_field\xcf\x80");
+  diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+  ASSERT_STREQ ("\n"
+                " \xf0\x9f\x98\x82"
+                   "_foo = \xcf\x80"
+                           "_bar.\xf0\x9f\x98\x82"
+                                  "_field\xcf\x80"
+                                         ";\n"
+                "                ^~~~~~~~\n"
+                "                m_\xf0\x9f\x98\x82"
+                                    "_field\xcf\x80\n",
+                pp_formatted_text (dc.printer));
+}
+
+/* Replace fix-it hint: replacing "SS_fieldP" with "m_SS_fieldP",
+   but where the caret was elsewhere.  */
+
+static void
+test_one_liner_fixit_replace_non_equal_range_utf8 ()
+{
+  test_diagnostic_context dc;
+  /* Byte column 10 is the "=", at display column 8; bytes 19-30 are
+     display columns 16-24.  */
+  location_t equals = linemap_position_for_column (line_table, 10);
+  location_t start = linemap_position_for_column (line_table, 19);
+  location_t finish = linemap_position_for_column (line_table, 30);
+  rich_location richloc (line_table, equals);
+  source_range range;
+  range.m_start = start;
+  range.m_finish = finish;
+  richloc.add_fixit_replace (range, "m_\xf0\x9f\x98\x82_field\xcf\x80");
+  diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+  /* The replacement range is not indicated in the annotation line, so
+     it should be indicated via an additional underline.  */
+  ASSERT_STREQ ("\n"
+                " \xf0\x9f\x98\x82"
+                   "_foo = \xcf\x80"
+                           "_bar.\xf0\x9f\x98\x82"
+                                  "_field\xcf\x80"
+                                         ";\n"
+                "        ^\n"
+                "                ---------\n"
+                "                m_\xf0\x9f\x98\x82"
+                                    "_field\xcf\x80\n",
+                pp_formatted_text (dc.printer));
+}
+
+/* Replace fix-it hint: replacing "SS_fieldP" with "m_SS_fieldP",
+   where the caret was elsewhere, but where a secondary range
+   exactly covers "SS_fieldP".  */
+
+static void
+test_one_liner_fixit_replace_equal_secondary_range_utf8 ()
+{
+  test_diagnostic_context dc;
+  /* Byte column 10 is the "=", at display column 8; bytes 19-30 are
+     display columns 16-24.  */
+  location_t equals = linemap_position_for_column (line_table, 10);
+  location_t start = linemap_position_for_column (line_table, 19);
+  location_t finish = linemap_position_for_column (line_table, 30);
+  rich_location richloc (line_table, equals);
+  location_t field = make_location (start, start, finish);
+  richloc.add_range (field);
+  richloc.add_fixit_replace (field, "m_\xf0\x9f\x98\x82_field\xcf\x80");
+  diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+  /* The replacement range is indicated in the annotation line,
+     so it shouldn't be indicated via an additional underline.  */
+  ASSERT_STREQ ("\n"
+                " \xf0\x9f\x98\x82"
+                   "_foo = \xcf\x80"
+                           "_bar.\xf0\x9f\x98\x82"
+                                  "_field\xcf\x80"
+                                         ";\n"
+                "        ^       ~~~~~~~~~\n"
+                "                m_\xf0\x9f\x98\x82"
+                                    "_field\xcf\x80\n",
+                pp_formatted_text (dc.printer));
+}
+
+/* Verify that we can use ad-hoc locations when adding fixits to a
+   rich_location.  */
+
+static void
+test_one_liner_fixit_validation_adhoc_locations_utf8 ()
+{
+  /* Generate a range that's too long to be packed, so must
+     be stored as an ad-hoc location (given the defaults
+     of 5 bits or 0 bits of packed range); 41 columns > 2**5.  */
+  const location_t c12 = linemap_position_for_column (line_table, 12);
+  const location_t c52 = linemap_position_for_column (line_table, 52);
+  const location_t loc = make_location (c12, c12, c52);
+
+  if (c52 > LINE_MAP_MAX_LOCATION_WITH_COLS)
+    return;
+
+  ASSERT_TRUE (IS_ADHOC_LOC (loc));
+
+  /* In the expected output below, byte column 12 is display column 10.
+     Byte column 52 lies past the end of the 31-byte, 25-display-column
+     line, so it maps to display column 25 + (52 - 31) = 46: the underline
+     stops at the final ";" and is padded with spaces up to column 46,
+     while the removal marker spans display columns 10-46.  */
+
+  /* Insert.  */
+  {
+    rich_location richloc (line_table, loc);
+    richloc.add_fixit_insert_before (loc, "test");
+    /* It should not have been discarded by the validator.  */
+    ASSERT_EQ (1, richloc.get_num_fixit_hints ());
+
+    test_diagnostic_context dc;
+    diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+    ASSERT_STREQ ("\n"
+                  " \xf0\x9f\x98\x82"
+                     "_foo = \xcf\x80"
+                             "_bar.\xf0\x9f\x98\x82"
+                                    "_field\xcf\x80"
+                                           ";\n"
+                  "          ^~~~~~~~~~~~~~~                     \n"
+                  "          test\n",
+                  pp_formatted_text (dc.printer));
+  }
+
+  /* Remove.  */
+  {
+    rich_location richloc (line_table, loc);
+    source_range range = source_range::from_locations (loc, c52);
+    richloc.add_fixit_remove (range);
+    /* It should not have been discarded by the validator.  */
+    ASSERT_EQ (1, richloc.get_num_fixit_hints ());
+
+    test_diagnostic_context dc;
+    diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+    ASSERT_STREQ ("\n"
+                  " \xf0\x9f\x98\x82"
+                     "_foo = \xcf\x80"
+                             "_bar.\xf0\x9f\x98\x82"
+                                    "_field\xcf\x80"
+                                           ";\n"
+                  "          ^~~~~~~~~~~~~~~                     \n"
+                  "          -------------------------------------\n",
+                  pp_formatted_text (dc.printer));
+  }
+
+  /* Replace.  */
+  {
+    rich_location richloc (line_table, loc);
+    source_range range = source_range::from_locations (loc, c52);
+    richloc.add_fixit_replace (range, "test");
+    /* It should not have been discarded by the validator.  */
+    ASSERT_EQ (1, richloc.get_num_fixit_hints ());
+
+    test_diagnostic_context dc;
+    diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+    ASSERT_STREQ ("\n"
+                  " \xf0\x9f\x98\x82"
+                     "_foo = \xcf\x80"
+                             "_bar.\xf0\x9f\x98\x82"
+                                    "_field\xcf\x80"
+                                           ";\n"
+                  "          ^~~~~~~~~~~~~~~                     \n"
+                  "          test\n",
+                  pp_formatted_text (dc.printer));
+  }
+}
+
+/* Test of consolidating insertions at the same location.  */
+
+static void
+test_one_liner_many_fixits_1_utf8 ()
+{
+  test_diagnostic_context dc;
+  /* Byte column 10 is the "=", at display column 8.  */
+  location_t equals = linemap_position_for_column (line_table, 10);
+  rich_location richloc (line_table, equals);
+  /* Alternate between a pi symbol and "@"; all 19 insertions are at the
+     same place and so must be merged into a single hint.  */
+  for (int i = 0; i < 19; i++)
+    richloc.add_fixit_insert_before (i & 1 ? "@" : "\xcf\x80");
+  ASSERT_EQ (1, richloc.get_num_fixit_hints ());
+  diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+  ASSERT_STREQ ("\n"
+                " \xf0\x9f\x98\x82"
+                   "_foo = \xcf\x80"
+                           "_bar.\xf0\x9f\x98\x82"
+                                  "_field\xcf\x80"
+                                         ";\n"
+                "        ^\n"
+                "        \xcf\x80@\xcf\x80@\xcf\x80@\xcf\x80@\xcf\x80@"
+                                  "\xcf\x80@\xcf\x80@\xcf\x80@\xcf\x80@\xcf\x80\n",
+                pp_formatted_text (dc.printer));
+}
+
+/* Ensure that we can add an arbitrary number of fix-it hints to a
+   rich_location, even if they are not consolidated.  */
+
+static void
+test_one_liner_many_fixits_2_utf8 ()
+{
+  test_diagnostic_context dc;
+  /* Byte column 10 is the "=", at display column 8.  */
+  location_t equals = linemap_position_for_column (line_table, 10);
+  rich_location richloc (line_table, equals);
+  /* Byte columns chosen so that the insertions land on display columns
+     1, 3, 5, 7, 9, 11, 13, 15, 18, 20, 22, 24, 26, ..., 38.  Note the
+     jump from 15 to 18, which shows up as a double space below.  */
+  const int nlocs = 19;
+  int locs[nlocs] = {1, 5, 7, 9, 11, 14, 16, 18, 23, 25, 27, 29, 32,
+                     34, 36, 38, 40, 42, 44};
+  for (int i = 0; i != nlocs; ++i)
+    {
+      location_t loc = linemap_position_for_column (line_table, locs[i]);
+      richloc.add_fixit_insert_before (loc, i & 1 ? "@" : "\xcf\x80");
+    }
+
+  ASSERT_EQ (nlocs, richloc.get_num_fixit_hints ());
+  diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+  ASSERT_STREQ ("\n"
+                " \xf0\x9f\x98\x82"
+                   "_foo = \xcf\x80"
+                           "_bar.\xf0\x9f\x98\x82"
+                                  "_field\xcf\x80"
+                                         ";\n"
+                "        ^\n"
+                " \xcf\x80 @ \xcf\x80 @ \xcf\x80 @ \xcf\x80 @  \xcf\x80 @"
+                                     " \xcf\x80 @ \xcf\x80 @ \xcf\x80 @ \xcf\x80 @ \xcf\x80\n",
+                pp_formatted_text (dc.printer));
+}
+
+/* Test of labeling the ranges within a rich_location.  */
+
+static void
+test_one_liner_labels_utf8 ()
+{
+  /* "SS_foo": bytes 1-8, display columns 1-6.  */
+  location_t foo
+    = make_location (linemap_position_for_column (line_table, 1),
+                     linemap_position_for_column (line_table, 1),
+                     linemap_position_for_column (line_table, 8));
+  /* "P_bar": bytes 12-17, display columns 10-14.  */
+  location_t bar
+    = make_location (linemap_position_for_column (line_table, 12),
+                     linemap_position_for_column (line_table, 12),
+                     linemap_position_for_column (line_table, 17));
+  /* "SS_fieldP": bytes 19-30, display columns 16-24.  */
+  location_t field
+    = make_location (linemap_position_for_column (line_table, 19),
+                     linemap_position_for_column (line_table, 19),
+                     linemap_position_for_column (line_table, 30));
+
+  /* Example where all the labels fit on one line.  The labels are 6, 5
+     and 9 display columns wide and start at display columns 1, 10
+     and 16.  */
+  {
+    text_range_label label0
+      ("\xcf\x80\xcf\x80\xcf\x80\xcf\x80\xcf\x80\xcf\x80");
+    text_range_label label1
+      ("\xf0\x9f\x98\x82\xf0\x9f\x98\x82\xcf\x80");
+    text_range_label label2
+      ("\xf0\x9f\x98\x82\xcf\x80\xf0\x9f\x98\x82\xf0\x9f\x98\x82\xcf\x80"
+       "\xcf\x80");
+    gcc_rich_location richloc (foo, &label0);
+    richloc.add_range (bar, SHOW_RANGE_WITHOUT_CARET, &label1);
+    richloc.add_range (field, SHOW_RANGE_WITHOUT_CARET, &label2);
+
+    {
+      test_diagnostic_context dc;
+      diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+      ASSERT_STREQ ("\n"
+                    " \xf0\x9f\x98\x82"
+                       "_foo = \xcf\x80"
+                               "_bar.\xf0\x9f\x98\x82"
+                                      "_field\xcf\x80"
+                                             ";\n"
+                    " ^~~~~~   ~~~~~ ~~~~~~~~~\n"
+                    " |        |     |\n"
+                    " \xcf\x80\xcf\x80\xcf\x80\xcf\x80\xcf\x80\xcf\x80"
+                           "   \xf0\x9f\x98\x82\xf0\x9f\x98\x82\xcf\x80"
+                                   " \xf0\x9f\x98\x82\xcf\x80\xf0\x9f\x98\x82"
+                                         "\xf0\x9f\x98\x82\xcf\x80\xcf\x80\n",
+                    pp_formatted_text (dc.printer));
+    }
+
+    /* Verify that we can disable label-printing.  */
+    {
+      test_diagnostic_context dc;
+      dc.show_labels_p = false;
+      diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+      ASSERT_STREQ ("\n"
+                    " \xf0\x9f\x98\x82"
+                       "_foo = \xcf\x80"
+                               "_bar.\xf0\x9f\x98\x82"
+                                      "_field\xcf\x80"
+                                             ";\n"
+                    " ^~~~~~   ~~~~~ ~~~~~~~~~\n",
+                    pp_formatted_text (dc.printer));
+    }
+  }
+
+  /* Example where the labels need extra lines.  */
+  {
+    text_range_label label0 ("label 0\xf0\x9f\x98\x82");
+    text_range_label label1 ("label 1\xcf\x80");
+    text_range_label label2 ("label 2\xcf\x80");
+    gcc_rich_location richloc (foo, &label0);
+    richloc.add_range (bar, SHOW_RANGE_WITHOUT_CARET, &label1);
+    richloc.add_range (field, SHOW_RANGE_WITHOUT_CARET, &label2);
+
+    test_diagnostic_context dc;
+    diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+
+    ASSERT_STREQ ("\n"
+                  " \xf0\x9f\x98\x82"
+                     "_foo = \xcf\x80"
+                             "_bar.\xf0\x9f\x98\x82"
+                                    "_field\xcf\x80"
+                                           ";\n"
+                  " ^~~~~~   ~~~~~ ~~~~~~~~~\n"
+                  " |        |     |\n"
+                  " |        |     label 2\xcf\x80\n"
+                  " |        label 1\xcf\x80\n"
+                  " label 0\xf0\x9f\x98\x82\n",
+                  pp_formatted_text (dc.printer));
+  }
+
+  /* Example of boundary conditions: label 0 and 1 have just enough clearance,
+     but label 1 just touches label 2.  Label 0 is 8 display columns wide
+     (columns 1-8) and label 1 is 6 wide (columns 10-15).  */
+  {
+    text_range_label label0 ("aaaaa\xf0\x9f\x98\x82\xcf\x80");
+    text_range_label label1 ("bb\xf0\x9f\x98\x82\xf0\x9f\x98\x82");
+    text_range_label label2 ("c");
+    gcc_rich_location richloc (foo, &label0);
+    richloc.add_range (bar, SHOW_RANGE_WITHOUT_CARET, &label1);
+    richloc.add_range (field, SHOW_RANGE_WITHOUT_CARET, &label2);
+
+    test_diagnostic_context dc;
+    diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+    ASSERT_STREQ ("\n"
+                  " \xf0\x9f\x98\x82"
+                     "_foo = \xcf\x80"
+                             "_bar.\xf0\x9f\x98\x82"
+                                    "_field\xcf\x80"
+                                           ";\n"
+                  " ^~~~~~   ~~~~~ ~~~~~~~~~\n"
+                  " |        |     |\n"
+                  " |        |     c\n"
+                  " aaaaa\xf0\x9f\x98\x82\xcf\x80"
+                           " bb\xf0\x9f\x98\x82\xf0\x9f\x98\x82\n",
+                  pp_formatted_text (dc.printer));
+  }
+}
+
+/* Driver for the multibyte one-liner tests: write the shared one-line
+   source file, register it with the line table, sanity-check the
+   byte/display column bookkeeping, and then run each test in turn.  */
+
+static void
+test_diagnostic_show_locus_one_liner_utf8 (const line_table_case &case_)
+{
+  /* Create a tempfile and write some text to it.  The tables give, for
+     each character of the literal below, the display column and the byte
+     column it belongs to.  */
+  const char *content
+    /* Display columns.
+       0000000000000000000000011111111111111111111111111111112222222222222
+       1111111122222222345678900000000123456666666677777777890123444444445  */
+    = "\xf0\x9f\x98\x82_foo = \xcf\x80_bar.\xf0\x9f\x98\x82_field\xcf\x80;\n";
+    /* 0000000000000000000001111111111111111111222222222222222222222233333
+       1111222233334444567890122223333456789999000011112222345678999900001
+       Byte columns.  */
+  temp_source_file tmp (SELFTEST_LOCATION, ".c", content);
+  line_table_test ltt (case_);
+
+  linemap_add (line_table, LC_ENTER, false, tmp.get_filename (), 1);
+
+  /* The final ";" is at byte column 31.  */
+  location_t last_byte_loc = linemap_position_for_column (line_table, 31);
+
+  /* Don't attempt to run the tests if column data might be unavailable.  */
+  if (last_byte_loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
+    return;
+
+  ASSERT_STREQ (tmp.get_filename (), LOCATION_FILE (last_byte_loc));
+  ASSERT_EQ (1, LOCATION_LINE (last_byte_loc));
+  ASSERT_EQ (31, LOCATION_COLUMN (last_byte_loc));
+
+  /* 31 bytes of source occupy 25 display columns.  */
+  char_span source_line = location_get_source_line (tmp.get_filename (), 1);
+  ASSERT_EQ (25, cpp_display_width (source_line.get_buffer (),
+                                    source_line.length ()));
+  ASSERT_EQ (25, location_compute_display_column
+                   (expand_location (last_byte_loc)));
+
+  test_one_liner_simple_caret_utf8 ();
+  test_one_liner_caret_and_range_utf8 ();
+  test_one_liner_multiple_carets_and_ranges_utf8 ();
+  test_one_liner_fixit_insert_before_utf8 ();
+  test_one_liner_fixit_insert_after_utf8 ();
+  test_one_liner_fixit_remove_utf8 ();
+  test_one_liner_fixit_replace_utf8 ();
+  test_one_liner_fixit_replace_non_equal_range_utf8 ();
+  test_one_liner_fixit_replace_equal_secondary_range_utf8 ();
+  test_one_liner_fixit_validation_adhoc_locations_utf8 ();
+  test_one_liner_many_fixits_1_utf8 ();
+  test_one_liner_many_fixits_2_utf8 ();
+  test_one_liner_labels_utf8 ();
+}
+
/* Verify that gcc_rich_location::add_location_if_nearby works. */
static void
@@ -3221,13 +3901,16 @@ test_overlapped_fixit_printing (const line_table_case &case_)
/* Unit-test the line_corrections machinery. */
ASSERT_EQ (3, richloc.get_num_fixit_hints ());
const fixit_hint *hint_0 = richloc.get_fixit_hint (0);
- ASSERT_EQ (column_range (12, 12), get_affected_columns (hint_0));
+ ASSERT_EQ (column_range (12, 12), get_affected_range (hint_0, false));
+ ASSERT_EQ (column_range (12, 12), get_affected_range (hint_0, true));
ASSERT_EQ (column_range (12, 22), get_printed_columns (hint_0));
const fixit_hint *hint_1 = richloc.get_fixit_hint (1);
- ASSERT_EQ (column_range (18, 18), get_affected_columns (hint_1));
+ ASSERT_EQ (column_range (18, 18), get_affected_range (hint_1, false));
+ ASSERT_EQ (column_range (18, 18), get_affected_range (hint_1, true));
ASSERT_EQ (column_range (18, 20), get_printed_columns (hint_1));
const fixit_hint *hint_2 = richloc.get_fixit_hint (2);
- ASSERT_EQ (column_range (29, 28), get_affected_columns (hint_2));
+ ASSERT_EQ (column_range (29, 28), get_affected_range (hint_2, false));
+ ASSERT_EQ (column_range (29, 28), get_affected_range (hint_2, true));
ASSERT_EQ (column_range (29, 29), get_printed_columns (hint_2));
/* Add each hint in turn to a line_corrections instance,
@@ -3238,6 +3921,7 @@ test_overlapped_fixit_printing (const line_table_case &case_)
/* The first replace hint by itself. */
lc.add_hint (hint_0);
ASSERT_EQ (1, lc.m_corrections.length ());
+ ASSERT_EQ (column_range (12, 12), lc.m_corrections[0]->m_affected_bytes);
ASSERT_EQ (column_range (12, 12), lc.m_corrections[0]->m_affected_columns);
ASSERT_EQ (column_range (12, 22), lc.m_corrections[0]->m_printed_columns);
ASSERT_STREQ ("const_cast<", lc.m_corrections[0]->m_text);
@@ -3247,6 +3931,7 @@ test_overlapped_fixit_printing (const line_table_case &case_)
lc.add_hint (hint_1);
ASSERT_EQ (1, lc.m_corrections.length ());
ASSERT_STREQ ("const_cast<foo *> (", lc.m_corrections[0]->m_text);
+ ASSERT_EQ (column_range (12, 18), lc.m_corrections[0]->m_affected_bytes);
ASSERT_EQ (column_range (12, 18), lc.m_corrections[0]->m_affected_columns);
ASSERT_EQ (column_range (12, 30), lc.m_corrections[0]->m_printed_columns);
@@ -3256,6 +3941,7 @@ test_overlapped_fixit_printing (const line_table_case &case_)
ASSERT_STREQ ("const_cast<foo *> (ptr->field)",
lc.m_corrections[0]->m_text);
ASSERT_EQ (1, lc.m_corrections.length ());
+ ASSERT_EQ (column_range (12, 28), lc.m_corrections[0]->m_affected_bytes);
ASSERT_EQ (column_range (12, 28), lc.m_corrections[0]->m_affected_columns);
ASSERT_EQ (column_range (12, 41), lc.m_corrections[0]->m_printed_columns);
}
@@ -3358,6 +4044,243 @@ test_overlapped_fixit_printing (const line_table_case &case_)
}
}
+/* Multibyte-aware version of preceding tests.  See comments above
+   test_one_liner_simple_caret_utf8() too, we use the same two multibyte
+   characters here.  */
+
+static void
+test_overlapped_fixit_printing_utf8 (const line_table_case &case_)
+{
+  /* Create a tempfile and write some text to it.  Note the two leading
+     spaces in the literal: the tables below count them as columns 1
+     and 2, and the byte columns used further down depend on them.  */
+
+  const char *content
+    /* Display columns.
+       00000000000000000000000111111111111111111111111222222222222222223
+       12344444444555555556789012344444444555555556789012345678999999990  */
+    = "  f\xf0\x9f\x98\x82 *f = (f\xf0\x9f\x98\x82 *)ptr->field\xcf\x80;\n";
+    /* 00000000000000000000011111111111111111111112222222222333333333333
+       12344445555666677778901234566667777888899990123456789012333344445
+       Byte columns.  */
+
+  temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
+  line_table_test ltt (case_);
+
+  const line_map_ordinary *ord_map
+    = linemap_check_ordinary (linemap_add (line_table, LC_ENTER, false,
+                                           tmp.get_filename (), 0));
+
+  linemap_line_start (line_table, 1, 100);
+
+  const location_t final_line_end
+    = linemap_position_for_line_and_column (line_table, ord_map, 6, 50);
+
+  /* Don't attempt to run the tests if column data might be unavailable.  */
+  if (final_line_end > LINE_MAP_MAX_LOCATION_WITH_COLS)
+    return;
+
+  /* A test for converting a C-style cast to a C++-style cast.
+     In display columns: "(" is 12, ")" is 18, and the expression
+     "ptr->fieldP" spans 19-29.  */
+  const location_t open_paren
+    = linemap_position_for_line_and_column (line_table, ord_map, 1, 14);
+  const location_t close_paren
+    = linemap_position_for_line_and_column (line_table, ord_map, 1, 22);
+  const location_t expr_start
+    = linemap_position_for_line_and_column (line_table, ord_map, 1, 23);
+  const location_t expr_finish
+    = linemap_position_for_line_and_column (line_table, ord_map, 1, 34);
+  const location_t expr = make_location (expr_start, expr_start, expr_finish);
+
+  /* Various examples of fix-it hints that aren't themselves consolidated,
+     but for which the *printing* may need consolidation.  */
+
+  /* Example where 3 fix-it hints are printed as one.  */
+  {
+    test_diagnostic_context dc;
+    rich_location richloc (line_table, expr);
+    richloc.add_fixit_replace (open_paren, "const_cast<");
+    richloc.add_fixit_replace (close_paren, "> (");
+    richloc.add_fixit_insert_after (")");
+
+    diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+    ASSERT_STREQ ("\n"
+                  "   f\xf0\x9f\x98\x82"
+                        " *f = (f\xf0\x9f\x98\x82"
+                                  " *)ptr->field\xcf\x80"
+                                                ";\n"
+                  "                   ^~~~~~~~~~~\n"
+                  "            ------------------\n"
+                  "            const_cast<f\xf0\x9f\x98\x82"
+                                            " *> (ptr->field\xcf\x80"
+                                                            ")\n",
+                  pp_formatted_text (dc.printer));
+
+    /* Unit-test the line_corrections machinery.  For each hint the
+       affected range is checked in bytes (false) and in display
+       columns (true).  */
+    ASSERT_EQ (3, richloc.get_num_fixit_hints ());
+    const fixit_hint *hint_0 = richloc.get_fixit_hint (0);
+    ASSERT_EQ (column_range (14, 14), get_affected_range (hint_0, false));
+    ASSERT_EQ (column_range (12, 12), get_affected_range (hint_0, true));
+    ASSERT_EQ (column_range (12, 22), get_printed_columns (hint_0));
+    const fixit_hint *hint_1 = richloc.get_fixit_hint (1);
+    ASSERT_EQ (column_range (22, 22), get_affected_range (hint_1, false));
+    ASSERT_EQ (column_range (18, 18), get_affected_range (hint_1, true));
+    ASSERT_EQ (column_range (18, 20), get_printed_columns (hint_1));
+    const fixit_hint *hint_2 = richloc.get_fixit_hint (2);
+    ASSERT_EQ (column_range (35, 34), get_affected_range (hint_2, false));
+    ASSERT_EQ (column_range (30, 29), get_affected_range (hint_2, true));
+    ASSERT_EQ (column_range (30, 30), get_printed_columns (hint_2));
+
+    /* Add each hint in turn to a line_corrections instance,
+       and verify that they are consolidated into one correction instance
+       as expected.  */
+    line_corrections lc (tmp.get_filename (), 1);
+
+    /* The first replace hint by itself.  */
+    lc.add_hint (hint_0);
+    ASSERT_EQ (1, lc.m_corrections.length ());
+    ASSERT_EQ (column_range (14, 14), lc.m_corrections[0]->m_affected_bytes);
+    ASSERT_EQ (column_range (12, 12), lc.m_corrections[0]->m_affected_columns);
+    ASSERT_EQ (column_range (12, 22), lc.m_corrections[0]->m_printed_columns);
+    ASSERT_STREQ ("const_cast<", lc.m_corrections[0]->m_text);
+
+    /* After the second replacement hint, they are printed together
+       as a replacement (along with the text between them).  */
+    lc.add_hint (hint_1);
+    ASSERT_EQ (1, lc.m_corrections.length ());
+    ASSERT_STREQ ("const_cast<f\xf0\x9f\x98\x82 *> (",
+                  lc.m_corrections[0]->m_text);
+    ASSERT_EQ (column_range (14, 22), lc.m_corrections[0]->m_affected_bytes);
+    ASSERT_EQ (column_range (12, 18), lc.m_corrections[0]->m_affected_columns);
+    ASSERT_EQ (column_range (12, 30), lc.m_corrections[0]->m_printed_columns);
+
+    /* After the final insertion hint, they are all printed together
+       as a replacement (along with the text between them).  */
+    lc.add_hint (hint_2);
+    ASSERT_STREQ ("const_cast<f\xf0\x9f\x98\x82 *> (ptr->field\xcf\x80)",
+                  lc.m_corrections[0]->m_text);
+    ASSERT_EQ (1, lc.m_corrections.length ());
+    ASSERT_EQ (column_range (14, 34), lc.m_corrections[0]->m_affected_bytes);
+    ASSERT_EQ (column_range (12, 29), lc.m_corrections[0]->m_affected_columns);
+    ASSERT_EQ (column_range (12, 42), lc.m_corrections[0]->m_printed_columns);
+  }
+
+  /* Example where two are consolidated during printing.  */
+  {
+    test_diagnostic_context dc;
+    rich_location richloc (line_table, expr);
+    richloc.add_fixit_replace (open_paren, "CAST (");
+    richloc.add_fixit_replace (close_paren, ") (");
+    richloc.add_fixit_insert_after (")");
+
+    diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+    ASSERT_STREQ ("\n"
+                  "   f\xf0\x9f\x98\x82"
+                        " *f = (f\xf0\x9f\x98\x82"
+                                  " *)ptr->field\xcf\x80"
+                                                ";\n"
+                  "                   ^~~~~~~~~~~\n"
+                  "            -\n"
+                  "            CAST (-\n"
+                  "                  ) (         )\n",
+                  pp_formatted_text (dc.printer));
+  }
+
+  /* Example where none are consolidated during printing.  */
+  {
+    test_diagnostic_context dc;
+    rich_location richloc (line_table, expr);
+    richloc.add_fixit_replace (open_paren, "CST (");
+    richloc.add_fixit_replace (close_paren, ") (");
+    richloc.add_fixit_insert_after (")");
+
+    diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+    ASSERT_STREQ ("\n"
+                  "   f\xf0\x9f\x98\x82"
+                        " *f = (f\xf0\x9f\x98\x82"
+                                  " *)ptr->field\xcf\x80"
+                                                ";\n"
+                  "                   ^~~~~~~~~~~\n"
+                  "            -\n"
+                  "            CST ( -\n"
+                  "                  ) (         )\n",
+                  pp_formatted_text (dc.printer));
+  }
+
+  /* Example of deletion fix-it hints.  */
+  {
+    test_diagnostic_context dc;
+    rich_location richloc (line_table, expr);
+    richloc.add_fixit_insert_before (open_paren, "(bar\xf0\x9f\x98\x82 *)");
+    source_range victim = {open_paren, close_paren};
+    richloc.add_fixit_remove (victim);
+
+    /* This case is actually handled by fixit-consolidation,
+       rather than by line_corrections.  */
+    ASSERT_EQ (1, richloc.get_num_fixit_hints ());
+
+    diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+    ASSERT_STREQ ("\n"
+                  "   f\xf0\x9f\x98\x82"
+                        " *f = (f\xf0\x9f\x98\x82"
+                                  " *)ptr->field\xcf\x80"
+                                                ";\n"
+                  "                   ^~~~~~~~~~~\n"
+                  "            -------\n"
+                  "            (bar\xf0\x9f\x98\x82"
+                                    " *)\n",
+                  pp_formatted_text (dc.printer));
+  }
+
+  /* Example of deletion fix-it hints that would overlap.  */
+  {
+    test_diagnostic_context dc;
+    rich_location richloc (line_table, expr);
+    richloc.add_fixit_insert_before (open_paren, "(long\xf0\x9f\x98\x82 *)");
+    source_range victim = {expr_start, expr_finish};
+    richloc.add_fixit_remove (victim);
+
+    /* These fixits are not consolidated.  */
+    ASSERT_EQ (2, richloc.get_num_fixit_hints ());
+
+    /* But the corrections are.  */
+    diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+    ASSERT_STREQ ("\n"
+                  "   f\xf0\x9f\x98\x82"
+                        " *f = (f\xf0\x9f\x98\x82"
+                                  " *)ptr->field\xcf\x80"
+                                                ";\n"
+                  "                   ^~~~~~~~~~~\n"
+                  "            ------------------\n"
+                  "            (long\xf0\x9f\x98\x82"
+                                     " *)(f\xf0\x9f\x98\x82"
+                                            " *)\n",
+                  pp_formatted_text (dc.printer));
+  }
+
+  /* Example of insertion fix-it hints that would overlap.  */
+  {
+    test_diagnostic_context dc;
+    rich_location richloc (line_table, expr);
+    richloc.add_fixit_insert_before
+      (open_paren, "L\xf0\x9f\x98\x82NGER THAN THE CAST");
+    richloc.add_fixit_insert_after (close_paren, "TEST");
+
+    /* The first insertion is long enough that if printed naively,
+       it would overlap with the second.
+       Verify that they are printed as a single replacement.  */
+    diagnostic_show_locus (&dc, &richloc, DK_ERROR);
+    ASSERT_STREQ ("\n"
+                  "   f\xf0\x9f\x98\x82"
+                        " *f = (f\xf0\x9f\x98\x82"
+                                  " *)ptr->field\xcf\x80"
+                                                ";\n"
+                  "                   ^~~~~~~~~~~\n"
+                  "            -------\n"
+                  "            L\xf0\x9f\x98\x82"
+                                 "NGER THAN THE CAST(f\xf0\x9f\x98\x82"
+                                                       " *)TEST\n",
+                  pp_formatted_text (dc.printer));
+  }
+}
+
/* Verify that the line_corrections machinery correctly prints
overlapping fixit-hints that have been added in the wrong
order.
@@ -3407,10 +4330,10 @@ test_overlapped_fixit_printing_2 (const line_table_case &case_)
/* These fixits should be accepted; they can't be consolidated. */
ASSERT_EQ (2, richloc.get_num_fixit_hints ());
const fixit_hint *hint_0 = richloc.get_fixit_hint (0);
- ASSERT_EQ (column_range (23, 22), get_affected_columns (hint_0));
+ ASSERT_EQ (column_range (23, 22), get_affected_range (hint_0, false));
ASSERT_EQ (column_range (23, 23), get_printed_columns (hint_0));
const fixit_hint *hint_1 = richloc.get_fixit_hint (1);
- ASSERT_EQ (column_range (21, 20), get_affected_columns (hint_1));
+ ASSERT_EQ (column_range (21, 20), get_affected_range (hint_1, false));
ASSERT_EQ (column_range (21, 21), get_printed_columns (hint_1));
/* Verify that they're printed correctly. */
@@ -3737,10 +4660,12 @@ diagnostic_show_locus_c_tests ()
test_diagnostic_show_locus_unknown_location ();
for_each_line_table_case (test_diagnostic_show_locus_one_liner);
+ for_each_line_table_case (test_diagnostic_show_locus_one_liner_utf8);
for_each_line_table_case (test_add_location_if_nearby);
for_each_line_table_case (test_diagnostic_show_locus_fixit_lines);
for_each_line_table_case (test_fixit_consolidation);
for_each_line_table_case (test_overlapped_fixit_printing);
+ for_each_line_table_case (test_overlapped_fixit_printing_utf8);
for_each_line_table_case (test_overlapped_fixit_printing_2);
for_each_line_table_case (test_fixit_insert_containing_newline);
for_each_line_table_case (test_fixit_insert_containing_newline_2);
diff --git a/gcc/diagnostic.c b/gcc/diagnostic.c
index 96b6fa30052..8638fbebb2d 100644
--- a/gcc/diagnostic.c
+++ b/gcc/diagnostic.c
@@ -346,9 +346,13 @@ diagnostic_get_location_text (diagnostic_context *context,
const char *locus_cs = colorize_start (pp_show_color (pp), "locus");
const char *locus_ce = colorize_stop (pp_show_color (pp));
const char *file = s.file ? s.file : progname;
- int line = strcmp (file, N_("<built-in>")) ? s.line : 0;
- int col = context->show_column ? s.column : 0;
-
+ int line = 0;
+ int col = 0;
+ if (strcmp (file, N_("<built-in>")))
+ {
+ line = s.line;
+ col = context->show_column ? location_compute_display_column (s) : 0;
+ }
const char *line_col = maybe_line_and_column (line, col);
return build_message_string ("%s%s%s:%s", locus_cs, file,
line_col, locus_ce);
diff --git a/gcc/input.c b/gcc/input.c
index 00301ef68dd..d2d99000b84 100644
--- a/gcc/input.c
+++ b/gcc/input.c
@@ -908,6 +908,18 @@ make_location (location_t caret, source_range src_range)
return COMBINE_LOCATION_DATA (line_table, pure_loc, src_range, NULL);
}
+/* Convert the byte column stored in EXPLOC to a display column.
+   If EXPLOC has no file, line or column, the stored column is returned
+   unchanged.  Otherwise the source line is fetched with
+   location_get_source_line and passed, together with the byte column,
+   to cpp_byte_column_to_display_column.  */
+
+int
+location_compute_display_column (expanded_location exploc)
+{
+ if (!(exploc.file && exploc.line && exploc.column))
+ return exploc.column;
+ char_span line = location_get_source_line (exploc.file, exploc.line);
+ /* If line is NULL, this function returns exploc.column which is the
+ desired fallback. */
+ return cpp_byte_column_to_display_column (line.get_buffer (), line.length (),
+ exploc.column);
+}
+
/* Dump statistics to stderr about the memory usage of the line_table
set of line maps. This also displays some statistics about macro
expansion. */
@@ -3590,6 +3602,51 @@ test_line_offset_overflow ()
ASSERT_NE (ordmap_a, ordmap_b);
}
+/* Selftest for the display-width helpers declared in cpplib.h:
+   cpp_display_width, cpp_byte_column_to_display_column and
+   cpp_display_column_to_byte_column.  */
+void test_cpp_utf8 ()
+{
+  /* Each byte of malformed UTF-8, and each control byte, must count as
+     exactly one display column.  */
+  {
+    const int invalid_width = cpp_display_width ("\xf0!\x9f!\x98!\x82!", 8);
+    const int control_width = cpp_display_width ("\r\t\n\v\0\1", 6);
+    ASSERT_EQ (8, invalid_width);
+    ASSERT_EQ (6, control_width);
+  }
+
+  /* Well-formed UTF-8 must get the width of the characters it encodes.  */
+  {
+    const int pi_width = cpp_display_width ("\xcf\x80", 2);
+    const int emoji_width = cpp_display_width ("\xf0\x9f\x98\x82", 4);
+    const int ascii_width = cpp_display_width ("GCC", 3);
+    const int mixed_width
+      = cpp_display_width ("\xcf\x80 = 3.14 \xf0\x9f\x98\x82 \x9f!", 17);
+    ASSERT_EQ (1, pi_width);
+    ASSERT_EQ (2, emoji_width);
+    ASSERT_EQ (3, ascii_width);
+    ASSERT_EQ (14, mixed_width);
+  }
+
+  /* cpp_byte_column_to_display_column must cope with a column beyond the
+     end of the string, with a NULL string and with a zero column.  */
+  {
+    const char *text = "\xcf\x80 abc";
+    ASSERT_EQ (5, cpp_display_width (text, 6));
+    ASSERT_EQ (105, cpp_byte_column_to_display_column (text, 6, 106));
+    ASSERT_EQ (10000, cpp_byte_column_to_display_column (NULL, 0, 10000));
+    ASSERT_EQ (0, cpp_byte_column_to_display_column (NULL, 10000, 0));
+  }
+
+  /* Likewise for the inverse, cpp_display_column_to_byte_column.  */
+  {
+    const char *text = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
+    ASSERT_EQ (4, cpp_display_column_to_byte_column (text, 15, 2));
+    ASSERT_EQ (15, cpp_display_column_to_byte_column (text, 15, 11));
+    ASSERT_EQ (115, cpp_display_column_to_byte_column (text, 15, 111));
+    ASSERT_EQ (10000, cpp_display_column_to_byte_column (NULL, 0, 10000));
+    ASSERT_EQ (0, cpp_display_column_to_byte_column (NULL, 10000, 0));
+  }
+}
+
/* Run all of the selftests within this file. */
void
@@ -3631,6 +3688,8 @@ input_c_tests ()
test_reading_source_line ();
test_line_offset_overflow ();
+
+ test_cpp_utf8 ();
}
} // namespace selftest
diff --git a/gcc/input.h b/gcc/input.h
index c459bf28553..35e02bd91d5 100644
--- a/gcc/input.h
+++ b/gcc/input.h
@@ -38,6 +38,7 @@ STATIC_ASSERT (BUILTINS_LOCATION < RESERVED_LOCATION_COUNT);
extern bool is_location_from_builtin_token (location_t);
extern expanded_location expand_location (location_t);
+extern int location_compute_display_column (expanded_location);
/* A class capturing the bounds of a buffer, to allow for run-time
bounds-checking in a checked build. */
diff --git a/libcpp/charset.c b/libcpp/charset.c
index 39af77a554a..d1bdff095eb 100644
--- a/libcpp/charset.c
+++ b/libcpp/charset.c
@@ -2257,3 +2257,106 @@ cpp_string_location_reader::get_next ()
m_loc += m_offset_per_column;
return result;
}
+
+/* Shared worker for cpp_byte_column_to_display_column and its inverse.
+   Return the display width of the UTF-8-encoded character that *INBUFP
+   points to.  On entry *INBUFP is at the first byte of the encoding and
+   *INBYTESLEFTP holds the number of bytes remaining in the buffer that
+   *INBUFP points into; on return *INBUFP is just past the last byte
+   consumed and *INBYTESLEFTP has been reduced to match.  If *INBUFP does
+   not point to a valid UTF-8-encoded sequence, a single byte is consumed
+   and it is given a display width of 1.  */
+
+static inline int
+compute_next_display_width (const uchar **inbufp, size_t *inbytesleftp)
+{
+  cppchar_t decoded;
+
+  /* On success one_utf8_to_cppchar() has already advanced *INBUFP and
+     reduced *INBYTESLEFTP for us.  */
+  if (one_utf8_to_cppchar (inbufp, inbytesleftp, &decoded) == 0)
+    return cpp_wcwidth (decoded);
+
+  /* The input is not valid UTF-8.  That can be fine, e.g. inside a string
+     literal, so do not complain: step over a single byte and report a
+     width of one for it.  */
+  *inbufp += 1;
+  *inbytesleftp -= 1;
+  return 1;
+}
+
+/* Return how many display columns are taken up by the first COLUMN bytes of
+   the DATA_LENGTH-byte string starting at DATA.  COLUMN is allowed to run
+   past DATA_LENGTH; each phantom byte beyond the end of the string is then
+   counted as one display column.  */
+
+int
+cpp_byte_column_to_display_column (const char *data, int data_length,
+                                   int column)
+{
+  const int phantom = column > data_length ? column - data_length : 0;
+  const uchar *cursor = (const uchar *) data;
+  size_t remaining = column - phantom;
+  int width = phantom;
+  while (remaining != 0)
+    width += compute_next_display_width (&cursor, &remaining);
+  return width;
+}
+
+/* For the string of length DATA_LENGTH bytes that begins at DATA, compute
+   the least number of bytes that covers at least DISPLAY_COL display columns.
+   A wide character straddling DISPLAY_COL is counted whole.  If the string is
+   too narrow, one byte per missing column is added past DATA_LENGTH.  */
+
+int
+cpp_display_column_to_byte_column (const char *data, int data_length,
+                                   int display_col)
+{
+  int col = 0;
+  const uchar *udata = (const uchar *) data;
+  size_t left = data_length;
+  while (col < display_col && left)
+    col += compute_next_display_width (&udata, &left);
+  return data_length - left + (col < display_col ? display_col - col : 0);
+}
+
+/* Our own, locale-independent version of wcwidth().  The glibc wcwidth()
+   is not used because it inspects the user's locale and, in an ASCII locale
+   in particular, returns nothing useful for extended characters, whereas
+   GCC in other respects (see e.g. _cpp_default_encoding()) behaves as if
+   everything is UTF-8.  A few tweaks suit the way GCC uses the data, e.g.
+   tabs and other control characters are treated as having width 1.  The
+   lookup tables are generated by contrib/unicode/gen_wcwidth.py, which
+   calls glibc wcwidth() on all codepoints and then applies those tweaks.
+   wcwidth_widths[I] is the width returned for codepoints above
+   wcwidth_range_ends[I - 1] and up to wcwidth_range_ends[I].  The tables are
+   not highly optimized, but suffice for outputting diagnostics.  */
+
+#include "generated_cpp_wcwidth.h"
+int cpp_wcwidth (cppchar_t c)
+{
+  /* Expected common case: C falls in the very first range.  */
+  if (__builtin_expect (c <= wcwidth_range_ends[0], true))
+    return wcwidth_widths[0];
+
+  /* Binary search entries [1, NUM_RANGES) for the first range end that is
+     not below C.  */
+  static const int num_ranges
+    = sizeof wcwidth_range_ends / sizeof (*wcwidth_range_ends);
+  int lo = 1;
+  int hi = num_ranges;
+
+  while (lo < hi)
+    {
+      const int mid = lo + (hi - lo) / 2;
+      if (c > wcwidth_range_ends[mid])
+        lo = mid + 1;
+      else
+        hi = mid;
+    }
+
+  /* Codepoints past the last tabulated range end get the default width.  */
+  if (__builtin_expect (lo == num_ranges, false))
+    return 1;
+  return wcwidth_widths[lo];
+}
diff --git a/libcpp/generated_cpp_wcwidth.h b/libcpp/generated_cpp_wcwidth.h
new file mode 100644
index 00000000000..ec8b73d3d01
--- /dev/null
+++ b/libcpp/generated_cpp_wcwidth.h
@@ -0,0 +1,156 @@
+/* Generated by contrib/unicode/gen_wcwidth.py, with the help of glibc's
+ utf8_gen.py, using version 12.1.0 of the Unicode standard. */
+
+static const cppchar_t wcwidth_range_ends[] = {
+ 0x2ff, 0x36f, 0x482, 0x489, 0x590, 0x5bd, 0x5be, 0x5bf,
+ 0x5c0, 0x5c2, 0x5c3, 0x5c5, 0x5c6, 0x5c7, 0x60f, 0x61a,
+ 0x61b, 0x61c, 0x64a, 0x65f, 0x66f, 0x670, 0x6d5, 0x6dc,
+ 0x6de, 0x6e4, 0x6e6, 0x6e8, 0x6e9, 0x6ed, 0x710, 0x711,
+ 0x72f, 0x74a, 0x7a5, 0x7b0, 0x7ea, 0x7f3, 0x7fc, 0x7fd,
+ 0x815, 0x819, 0x81a, 0x823, 0x824, 0x827, 0x828, 0x82d,
+ 0x858, 0x85b, 0x8d2, 0x8e1, 0x8e2, 0x902, 0x939, 0x93a,
+ 0x93b, 0x93c, 0x940, 0x948, 0x94c, 0x94d, 0x950, 0x957,
+ 0x961, 0x963, 0x980, 0x981, 0x9bb, 0x9bc, 0x9c0, 0x9c4,
+ 0x9cc, 0x9cd, 0x9e1, 0x9e3, 0x9fd, 0x9fe, 0xa00, 0xa02,
+ 0xa3b, 0xa3c, 0xa40, 0xa42, 0xa46, 0xa48, 0xa4a, 0xa4d,
+ 0xa50, 0xa51, 0xa6f, 0xa71, 0xa74, 0xa75, 0xa80, 0xa82,
+ 0xabb, 0xabc, 0xac0, 0xac5, 0xac6, 0xac8, 0xacc, 0xacd,
+ 0xae1, 0xae3, 0xaf9, 0xaff, 0xb00, 0xb01, 0xb3b, 0xb3c,
+ 0xb3e, 0xb3f, 0xb40, 0xb44, 0xb4c, 0xb4d, 0xb55, 0xb56,
+ 0xb61, 0xb63, 0xb81, 0xb82, 0xbbf, 0xbc0, 0xbcc, 0xbcd,
+ 0xbff, 0xc00, 0xc03, 0xc04, 0xc3d, 0xc40, 0xc45, 0xc48,
+ 0xc49, 0xc4d, 0xc54, 0xc56, 0xc61, 0xc63, 0xc80, 0xc81,
+ 0xcbb, 0xcbc, 0xcbe, 0xcbf, 0xcc5, 0xcc6, 0xccb, 0xccd,
+ 0xce1, 0xce3, 0xcff, 0xd01, 0xd3a, 0xd3c, 0xd40, 0xd44,
+ 0xd4c, 0xd4d, 0xd61, 0xd63, 0xdc9, 0xdca, 0xdd1, 0xdd4,
+ 0xdd5, 0xdd6, 0xe30, 0xe31, 0xe33, 0xe3a, 0xe46, 0xe4e,
+ 0xeb0, 0xeb1, 0xeb3, 0xebc, 0xec7, 0xecd, 0xf17, 0xf19,
+ 0xf34, 0xf35, 0xf36, 0xf37, 0xf38, 0xf39, 0xf70, 0xf7e,
+ 0xf7f, 0xf84, 0xf85, 0xf87, 0xf8c, 0xf97, 0xf98, 0xfbc,
+ 0xfc5, 0xfc6, 0x102c, 0x1030, 0x1031, 0x1037, 0x1038, 0x103a,
+ 0x103c, 0x103e, 0x1057, 0x1059, 0x105d, 0x1060, 0x1070, 0x1074,
+ 0x1081, 0x1082, 0x1084, 0x1086, 0x108c, 0x108d, 0x109c, 0x109d,
+ 0x10ff, 0x115f, 0x11ff, 0x135c, 0x135f, 0x1711, 0x1714, 0x1731,
+ 0x1734, 0x1751, 0x1753, 0x1771, 0x1773, 0x17b3, 0x17b5, 0x17b6,
+ 0x17bd, 0x17c5, 0x17c6, 0x17c8, 0x17d3, 0x17dc, 0x17dd, 0x180a,
+ 0x180e, 0x1884, 0x1886, 0x18a8, 0x18a9, 0x191f, 0x1922, 0x1926,
+ 0x1928, 0x1931, 0x1932, 0x1938, 0x193b, 0x1a16, 0x1a18, 0x1a1a,
+ 0x1a1b, 0x1a55, 0x1a56, 0x1a57, 0x1a5e, 0x1a5f, 0x1a60, 0x1a61,
+ 0x1a62, 0x1a64, 0x1a6c, 0x1a72, 0x1a7c, 0x1a7e, 0x1a7f, 0x1aaf,
+ 0x1abe, 0x1aff, 0x1b03, 0x1b33, 0x1b34, 0x1b35, 0x1b3a, 0x1b3b,
+ 0x1b3c, 0x1b41, 0x1b42, 0x1b6a, 0x1b73, 0x1b7f, 0x1b81, 0x1ba1,
+ 0x1ba5, 0x1ba7, 0x1ba9, 0x1baa, 0x1bad, 0x1be5, 0x1be6, 0x1be7,
+ 0x1be9, 0x1bec, 0x1bed, 0x1bee, 0x1bf1, 0x1c2b, 0x1c33, 0x1c35,
+ 0x1c37, 0x1ccf, 0x1cd2, 0x1cd3, 0x1ce0, 0x1ce1, 0x1ce8, 0x1cec,
+ 0x1ced, 0x1cf3, 0x1cf4, 0x1cf7, 0x1cf9, 0x1dbf, 0x1df9, 0x1dfa,
+ 0x1dff, 0x200a, 0x200f, 0x2029, 0x202e, 0x205f, 0x2064, 0x2065,
+ 0x206f, 0x20cf, 0x20f0, 0x2319, 0x231b, 0x2328, 0x232a, 0x23e8,
+ 0x23ec, 0x23ef, 0x23f0, 0x23f2, 0x23f3, 0x25fc, 0x25fe, 0x2613,
+ 0x2615, 0x2647, 0x2653, 0x267e, 0x267f, 0x2692, 0x2693, 0x26a0,
+ 0x26a1, 0x26a9, 0x26ab, 0x26bc, 0x26be, 0x26c3, 0x26c5, 0x26cd,
+ 0x26ce, 0x26d3, 0x26d4, 0x26e9, 0x26ea, 0x26f1, 0x26f3, 0x26f4,
+ 0x26f5, 0x26f9, 0x26fa, 0x26fc, 0x26fd, 0x2704, 0x2705, 0x2709,
+ 0x270b, 0x2727, 0x2728, 0x274b, 0x274c, 0x274d, 0x274e, 0x2752,
+ 0x2755, 0x2756, 0x2757, 0x2794, 0x2797, 0x27af, 0x27b0, 0x27be,
+ 0x27bf, 0x2b1a, 0x2b1c, 0x2b4f, 0x2b50, 0x2b54, 0x2b55, 0x2cee,
+ 0x2cf1, 0x2d7e, 0x2d7f, 0x2ddf, 0x2dff, 0x2e7f, 0x2e99, 0x2e9a,
+ 0x2ef3, 0x2eff, 0x2fd5, 0x2fef, 0x2ffb, 0x2fff, 0x3029, 0x302d,
+ 0x303e, 0x3040, 0x3096, 0x3098, 0x309a, 0x30ff, 0x3104, 0x312f,
+ 0x3130, 0x318e, 0x318f, 0x31ba, 0x31bf, 0x31e3, 0x31ef, 0x321e,
+ 0x321f, 0x4db5, 0x4dbf, 0x9fef, 0x9fff, 0xa48c, 0xa48f, 0xa4c6,
+ 0xa66e, 0xa672, 0xa673, 0xa67d, 0xa69d, 0xa69f, 0xa6ef, 0xa6f1,
+ 0xa801, 0xa802, 0xa805, 0xa806, 0xa80a, 0xa80b, 0xa824, 0xa826,
+ 0xa8c3, 0xa8c5, 0xa8df, 0xa8f1, 0xa8fe, 0xa8ff, 0xa925, 0xa92d,
+ 0xa946, 0xa951, 0xa95f, 0xa97c, 0xa97f, 0xa982, 0xa9b2, 0xa9b3,
+ 0xa9b5, 0xa9b9, 0xa9bb, 0xa9bd, 0xa9e4, 0xa9e5, 0xaa28, 0xaa2e,
+ 0xaa30, 0xaa32, 0xaa34, 0xaa36, 0xaa42, 0xaa43, 0xaa4b, 0xaa4c,
+ 0xaa7b, 0xaa7c, 0xaaaf, 0xaab0, 0xaab1, 0xaab4, 0xaab6, 0xaab8,
+ 0xaabd, 0xaabf, 0xaac0, 0xaac1, 0xaaeb, 0xaaed, 0xaaf5, 0xaaf6,
+ 0xabe4, 0xabe5, 0xabe7, 0xabe8, 0xabec, 0xabed, 0xabff, 0xd7a3,
+ 0xf8ff, 0xfa6d, 0xfa6f, 0xfad9, 0xfb1d, 0xfb1e, 0xfdff, 0xfe0f,
+ 0xfe19, 0xfe1f, 0xfe2f, 0xfe52, 0xfe53, 0xfe66, 0xfe67, 0xfe6b,
+ 0xfefe, 0xfeff, 0xff00, 0xff60, 0xffdf, 0xffe6, 0xfff8, 0xfffb,
+ 0x101fc, 0x101fd, 0x102df, 0x102e0, 0x10375, 0x1037a, 0x10a00, 0x10a03,
+ 0x10a04, 0x10a06, 0x10a0b, 0x10a0f, 0x10a37, 0x10a3a, 0x10a3e, 0x10a3f,
+ 0x10ae4, 0x10ae6, 0x10d23, 0x10d27, 0x10f45, 0x10f50, 0x11000, 0x11001,
+ 0x11037, 0x11046, 0x1107e, 0x11081, 0x110b2, 0x110b6, 0x110b8, 0x110ba,
+ 0x110ff, 0x11102, 0x11126, 0x1112b, 0x1112c, 0x11134, 0x11172, 0x11173,
+ 0x1117f, 0x11181, 0x111b5, 0x111be, 0x111c8, 0x111cc, 0x1122e, 0x11231,
+ 0x11233, 0x11234, 0x11235, 0x11237, 0x1123d, 0x1123e, 0x112de, 0x112df,
+ 0x112e2, 0x112ea, 0x112ff, 0x11301, 0x1133a, 0x1133c, 0x1133f, 0x11340,
+ 0x11365, 0x1136c, 0x1136f, 0x11374, 0x11437, 0x1143f, 0x11441, 0x11444,
+ 0x11445, 0x11446, 0x1145d, 0x1145e, 0x114b2, 0x114b8, 0x114b9, 0x114ba,
+ 0x114be, 0x114c0, 0x114c1, 0x114c3, 0x115b1, 0x115b5, 0x115bb, 0x115bd,
+ 0x115be, 0x115c0, 0x115db, 0x115dd, 0x11632, 0x1163a, 0x1163c, 0x1163d,
+ 0x1163e, 0x11640, 0x116aa, 0x116ab, 0x116ac, 0x116ad, 0x116af, 0x116b5,
+ 0x116b6, 0x116b7, 0x1171c, 0x1171f, 0x11721, 0x11725, 0x11726, 0x1172b,
+ 0x1182e, 0x11837, 0x11838, 0x1183a, 0x119d3, 0x119d7, 0x119d9, 0x119db,
+ 0x119df, 0x119e0, 0x11a00, 0x11a0a, 0x11a32, 0x11a38, 0x11a3a, 0x11a3e,
+ 0x11a46, 0x11a47, 0x11a50, 0x11a56, 0x11a58, 0x11a5b, 0x11a89, 0x11a96,
+ 0x11a97, 0x11a99, 0x11c2f, 0x11c36, 0x11c37, 0x11c3d, 0x11c3e, 0x11c3f,
+ 0x11c91, 0x11ca7, 0x11ca9, 0x11cb0, 0x11cb1, 0x11cb3, 0x11cb4, 0x11cb6,
+ 0x11d30, 0x11d36, 0x11d39, 0x11d3a, 0x11d3b, 0x11d3d, 0x11d3e, 0x11d45,
+ 0x11d46, 0x11d47, 0x11d8f, 0x11d91, 0x11d94, 0x11d95, 0x11d96, 0x11d97,
+ 0x11ef2, 0x11ef4, 0x1342f, 0x13438, 0x16aef, 0x16af4, 0x16b2f, 0x16b36,
+ 0x16f4e, 0x16f4f, 0x16f8e, 0x16f92, 0x16fdf, 0x16fe3, 0x16fff, 0x187f7,
+ 0x187ff, 0x18af2, 0x1afff, 0x1b11e, 0x1b14f, 0x1b152, 0x1b163, 0x1b167,
+ 0x1b16f, 0x1b2fb, 0x1bc9c, 0x1bc9e, 0x1bc9f, 0x1bca3, 0x1d166, 0x1d169,
+ 0x1d172, 0x1d182, 0x1d184, 0x1d18b, 0x1d1a9, 0x1d1ad, 0x1d241, 0x1d244,
+ 0x1d9ff, 0x1da36, 0x1da3a, 0x1da6c, 0x1da74, 0x1da75, 0x1da83, 0x1da84,
+ 0x1da9a, 0x1da9f, 0x1daa0, 0x1daaf, 0x1dfff, 0x1e006, 0x1e007, 0x1e018,
+ 0x1e01a, 0x1e021, 0x1e022, 0x1e024, 0x1e025, 0x1e02a, 0x1e12f, 0x1e136,
+ 0x1e2eb, 0x1e2ef, 0x1e8cf, 0x1e8d6, 0x1e943, 0x1e94a, 0x1f003, 0x1f004,
+ 0x1f0ce, 0x1f0cf, 0x1f18d, 0x1f18e, 0x1f190, 0x1f19a, 0x1f1ff, 0x1f202,
+ 0x1f20f, 0x1f23b, 0x1f23f, 0x1f248, 0x1f24f, 0x1f251, 0x1f25f, 0x1f265,
+ 0x1f2ff, 0x1f320, 0x1f32c, 0x1f335, 0x1f336, 0x1f37c, 0x1f37d, 0x1f393,
+ 0x1f39f, 0x1f3ca, 0x1f3ce, 0x1f3d3, 0x1f3df, 0x1f3f0, 0x1f3f3, 0x1f3f4,
+ 0x1f3f7, 0x1f43e, 0x1f43f, 0x1f440, 0x1f441, 0x1f4fc, 0x1f4fe, 0x1f53d,
+ 0x1f54a, 0x1f54e, 0x1f54f, 0x1f567, 0x1f579, 0x1f57a, 0x1f594, 0x1f596,
+ 0x1f5a3, 0x1f5a4, 0x1f5fa, 0x1f64f, 0x1f67f, 0x1f6c5, 0x1f6cb, 0x1f6cc,
+ 0x1f6cf, 0x1f6d2, 0x1f6d4, 0x1f6d5, 0x1f6ea, 0x1f6ec, 0x1f6f3, 0x1f6fa,
+ 0x1f7df, 0x1f7eb, 0x1f90c, 0x1f971, 0x1f972, 0x1f976, 0x1f979, 0x1f9a2,
+ 0x1f9a4, 0x1f9aa, 0x1f9ad, 0x1f9ca, 0x1f9cc, 0x1f9ff, 0x1fa6f, 0x1fa73,
+ 0x1fa77, 0x1fa7a, 0x1fa7f, 0x1fa82, 0x1fa8f, 0x1fa95, 0x1ffff, 0x2a6d6,
+ 0x2a6ff, 0x2b734, 0x2b73f, 0x2b81d, 0x2b81f, 0x2cea1, 0x2ceaf, 0x2ebe0,
+ 0x2f7ff, 0x2fa1d, 0xe0000, 0xe0001, 0xe001f, 0xe007f, 0xe00ff, 0xe01ef,
+};
+
+static const unsigned char wcwidth_widths[] = {
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
+ 0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
+ 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
+ 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1,
+ 0, 1, 0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 0, 2, 1, 2, 1, 0, 2, 1, 2,
+ 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 0, 1, 0,
+ 2, 1, 0, 2, 1, 2, 1, 2, 1, 0, 1, 2, 1, 2, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
+ 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
+ 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
+ 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
+ 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2,
+ 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 0, 1, 0, 1, 0,
+};
diff --git a/libcpp/include/cpplib.h b/libcpp/include/cpplib.h
index a645f8136a6..fdc8badba7d 100644
--- a/libcpp/include/cpplib.h
+++ b/libcpp/include/cpplib.h
@@ -1305,4 +1305,15 @@ extern bool cpp_userdef_char_p
extern const char * cpp_get_userdef_suffix
(const cpp_token *);
+/* In charset.c */
+int cpp_byte_column_to_display_column (const char *data, int data_length,
+ int column);
+/* Return the number of display columns occupied by the whole of the
+   DATA_LENGTH-byte string at DATA, i.e. the display column that corresponds
+   to a byte column of DATA_LENGTH.  */
+inline int cpp_display_width (const char *data, int data_length)
+{
+  const int all_bytes = data_length;
+  return cpp_byte_column_to_display_column (data, data_length, all_bytes);
+}
+int cpp_display_column_to_byte_column (const char *data, int data_length,
+ int display_col);
+int cpp_wcwidth (cppchar_t c);
+
#endif /* ! LIBCPP_CPPLIB_H */