From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2181) id 3094E38493ED; Fri, 13 Jan 2023 13:42:49 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 3094E38493ED DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1673617369; bh=uL7GCXQS8Cd9dRsbCWnjurPw8HHUWULZh6JWXDaJJYc=; h=From:To:Subject:Date:From; b=YuFJ4iOhd7AUK8rtwzI1KL5nDULIVNSQaEWUukkmw0tW/vd4nZ73biJaW6VYFIkkm zbxpm21E7btZ8l+RptCTsSnlbZP1yOuhBOC2X7PYsHkRvLicOF1w8Inn+Xd9SZXpGr xxnJT31IiWznmq1r1M8AQuIVUrktQPWaeZcFGIDE= MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Content-Type: text/plain; charset="utf-8" From: Jonathan Wakely To: gcc-cvs@gcc.gnu.org, libstdc++-cvs@gcc.gnu.org Subject: [gcc r13-5144] libstdc++: Fix Unicode codecvt and add tests [PR86419] X-Act-Checkin: gcc X-Git-Author: Dimitrij Mijoski X-Git-Refname: refs/heads/master X-Git-Oldrev: e2fc12a5dafadf15d804e1d2541528296e97a847 X-Git-Newrev: 02dab998665dda0f6df31740e8897c42de3d740f Message-Id: <20230113134249.3094E38493ED@sourceware.org> Date: Fri, 13 Jan 2023 13:42:49 +0000 (GMT) List-Id: https://gcc.gnu.org/g:02dab998665dda0f6df31740e8897c42de3d740f commit r13-5144-g02dab998665dda0f6df31740e8897c42de3d740f Author: Dimitrij Mijoski Date: Tue Jan 10 13:58:59 2023 +0100 libstdc++: Fix Unicode codecvt and add tests [PR86419] Fixes the conversion from UTF-8 to UTF-16 to properly return partial instead of ok. Fixes the conversion from UTF-16 to UTF-8 to properly return partial instead of ok. Fixes the conversion from UTF-8 to UCS-2 to properly return partial instead of error. Fixes the conversion from UTF-8 to UCS-2 to treat 4-byte UTF-8 sequences as an error just by seeing the leading byte. Fixes UTF-8 decoding for all codecvts so they detect an error at the end of the input range when the last code point is also incomplete. 
libstdc++-v3/ChangeLog: PR libstdc++/86419 * src/c++11/codecvt.cc (read_utf8_code_point): Correctly detect errors in incomplete multibyte sequences. (utf16_in): Remove surrogates parameter. Fix conditions for returning partial. (utf16_out): Fix condition for returning partial. (ucs2_in): Do not pass surrogates argument to utf16_in. * testsuite/22_locale/codecvt/codecvt_unicode.cc: New test. * testsuite/22_locale/codecvt/codecvt_unicode.h: New header for tests. * testsuite/22_locale/codecvt/codecvt_unicode_wchar_t.cc: New test. Diff: --- libstdc++-v3/src/c++11/codecvt.cc | 36 +- .../testsuite/22_locale/codecvt/codecvt_unicode.cc | 68 ++ .../testsuite/22_locale/codecvt/codecvt_unicode.h | 1269 ++++++++++++++++++++ .../22_locale/codecvt/codecvt_unicode_wchar_t.cc | 59 + 4 files changed, 1414 insertions(+), 18 deletions(-) diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc index 9f8cb767732..03f0bfda972 100644 --- a/libstdc++-v3/src/c++11/codecvt.cc +++ b/libstdc++-v3/src/c++11/codecvt.cc @@ -277,13 +277,15 @@ namespace } else if (c1 < 0xF0) // 3-byte sequence { - if (avail < 3) + if (avail < 2) return incomplete_mb_character; char32_t c2 = (unsigned char) from[1]; if ((c2 & 0xC0) != 0x80) return invalid_mb_sequence; if (c1 == 0xE0 && c2 < 0xA0) // overlong return invalid_mb_sequence; + if (avail < 3) + return incomplete_mb_character; char32_t c3 = (unsigned char) from[2]; if ((c3 & 0xC0) != 0x80) return invalid_mb_sequence; @@ -292,9 +294,9 @@ namespace from += 3; return c; } - else if (c1 < 0xF5) // 4-byte sequence + else if (c1 < 0xF5 && maxcode > 0xFFFF) // 4-byte sequence { - if (avail < 4) + if (avail < 2) return incomplete_mb_character; char32_t c2 = (unsigned char) from[1]; if ((c2 & 0xC0) != 0x80) @@ -302,10 +304,14 @@ namespace if (c1 == 0xF0 && c2 < 0x90) // overlong return invalid_mb_sequence; if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF - return invalid_mb_sequence; + return invalid_mb_sequence; + if (avail < 3) + return 
incomplete_mb_character; char32_t c3 = (unsigned char) from[2]; if ((c3 & 0xC0) != 0x80) return invalid_mb_sequence; + if (avail < 4) + return incomplete_mb_character; char32_t c4 = (unsigned char) from[3]; if ((c4 & 0xC0) != 0x80) return invalid_mb_sequence; @@ -527,12 +533,11 @@ namespace // Flag indicating whether to process UTF-16 or UCS2 enum class surrogates { allowed, disallowed }; - // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed) - template + // utf8 -> utf16 (or utf8 -> ucs2 if maxcode <= 0xFFFF) + template codecvt_base::result - utf16_in(range& from, range& to, - unsigned long maxcode = max_code_point, codecvt_mode mode = {}, - surrogates s = surrogates::allowed) + utf16_in(range &from, range &to, + unsigned long maxcode = max_code_point, codecvt_mode mode = {}) { read_utf8_bom(from, mode); while (from.size() && to.size()) @@ -540,12 +545,7 @@ namespace auto orig = from; const char32_t codepoint = read_utf8_code_point(from, maxcode); if (codepoint == incomplete_mb_character) - { - if (s == surrogates::allowed) - return codecvt_base::partial; - else - return codecvt_base::error; // No surrogates in UCS2 - } + return codecvt_base::partial; if (codepoint > maxcode) return codecvt_base::error; if (!write_utf16_code_point(to, codepoint, mode)) @@ -554,7 +554,7 @@ namespace return codecvt_base::partial; } } - return codecvt_base::ok; + return from.size() ? codecvt_base::partial : codecvt_base::ok; } // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed) @@ -576,7 +576,7 @@ namespace return codecvt_base::error; // No surrogates in UCS-2 if (from.size() < 2) - return codecvt_base::ok; // stop converting at this point + return codecvt_base::partial; // stop converting at this point const char32_t c2 = from[1]; if (is_low_surrogate(c2)) @@ -629,7 +629,7 @@ namespace { // UCS-2 only supports characters in the BMP, i.e. 
one UTF-16 code unit: maxcode = std::min(max_single_utf16_unit, maxcode); - return utf16_in(from, to, maxcode, mode, surrogates::disallowed); + return utf16_in(from, to, maxcode, mode); } // ucs2 -> utf8 diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.cc new file mode 100644 index 00000000000..ae4b6c8968f --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.cc @@ -0,0 +1,68 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . 
+ +// { dg-do run { target c++11 } } + +#include "codecvt_unicode.h" + +#include + +using namespace std; + +void +test_utf8_utf32_codecvts () +{ + using codecvt_c32 = codecvt; + auto loc_c = locale::classic (); + VERIFY (has_facet (loc_c)); + auto &cvt = use_facet (loc_c); + test_utf8_utf32_codecvts (cvt); + + auto cvt_ptr = to_unique_ptr (new codecvt_utf8 ()); + test_utf8_utf32_codecvts (*cvt_ptr); +} + +void +test_utf8_utf16_codecvts () +{ + using codecvt_c16 = codecvt; + auto loc_c = locale::classic (); + VERIFY (has_facet (loc_c)); + auto &cvt = use_facet (loc_c); + test_utf8_utf16_cvts (cvt); + + auto cvt_ptr = to_unique_ptr (new codecvt_utf8_utf16 ()); + test_utf8_utf16_cvts (*cvt_ptr); + + auto cvt_ptr2 = to_unique_ptr (new codecvt_utf8_utf16 ()); + test_utf8_utf16_cvts (*cvt_ptr2); +} + +void +test_utf8_ucs2_codecvts () +{ + auto cvt_ptr = to_unique_ptr (new codecvt_utf8 ()); + test_utf8_ucs2_cvts (*cvt_ptr); +} + +int +main () +{ + test_utf8_utf32_codecvts (); + test_utf8_utf16_codecvts (); + test_utf8_ucs2_codecvts (); +} diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.h b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.h new file mode 100644 index 00000000000..99d1a46840e --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode.h @@ -0,0 +1,1269 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+ +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// . + +#include +#include +#include +#include + +template +std::unique_ptr +to_unique_ptr (T *ptr) +{ + return std::unique_ptr (ptr); +} + +struct test_offsets_ok +{ + size_t in_size, out_size; +}; +struct test_offsets_partial +{ + size_t in_size, out_size, expected_in_next, expected_out_next; +}; + +template struct test_offsets_error +{ + size_t in_size, out_size, expected_in_next, expected_out_next; + CharT replace_char; + size_t replace_pos; +}; + +template +auto constexpr array_size (const T (&)[N]) -> size_t +{ + return N; +} + +template +void +utf8_to_utf32_in_ok (const std::codecvt &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char in[] = "bш\uAAAA\U0010AAAA"; + const char32_t exp_literal[] = U"bш\uAAAA\U0010AAAA"; + CharT exp[array_size (exp_literal)] = {}; + std::copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (in) == 11, ""); + static_assert (array_size (exp_literal) == 5, ""); + static_assert (array_size (exp) == 5, ""); + VERIFY (char_traits::length (in) == 10); + VERIFY (char_traits::length (exp_literal) == 4); + VERIFY (char_traits::length (exp) == 4); + + test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 4}}; + for (auto t : offsets) + { + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY 
(out[t.out_size] == 0); + } + + for (auto t : offsets) + { + CharT out[array_size (exp)] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res + = cvt.in (state, in, in + t.in_size, in_next, out, end (out), out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } +} + +template +void +utf8_to_utf32_in_partial (const std::codecvt &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char in[] = "bш\uAAAA\U0010AAAA"; + const char32_t exp_literal[] = U"bш\uAAAA\U0010AAAA"; + CharT exp[array_size (exp_literal)] = {}; + std::copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (in) == 11, ""); + static_assert (array_size (exp_literal) == 5, ""); + static_assert (array_size (exp) == 5, ""); + VERIFY (char_traits::length (in) == 10); + VERIFY (char_traits::length (exp_literal) == 4); + VERIFY (char_traits::length (exp) == 4); + + test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {3, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // incomplete second CP + {2, 1, 1, 1}, // incomplete second CP, and no space for it + + {6, 2, 3, 2}, // no space for third CP + {4, 3, 3, 2}, // incomplete third CP + {5, 3, 3, 2}, // incomplete third CP + {4, 2, 3, 2}, // incomplete third CP, and no space for it + {5, 2, 3, 2}, // incomplete third CP, and no space for it + + {10, 3, 6, 3}, // no space for fourth CP + {7, 4, 6, 3}, // incomplete fourth CP + {8, 4, 6, 3}, // incomplete fourth CP + {9, 4, 6, 3}, // incomplete fourth CP + {7, 3, 6, 3}, // incomplete fourth CP, and no space for it + 
{8, 3, 6, 3}, // incomplete fourth CP, and no space for it + {9, 3, 6, 3}, // incomplete fourth CP, and no space for it + }; + + for (auto t : offsets) + { + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.partial); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template +void +utf8_to_utf32_in_error (const std::codecvt &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char valid_in[] = "bш\uAAAA\U0010AAAA"; + const char32_t exp_literal[] = U"bш\uAAAA\U0010AAAA"; + CharT exp[array_size (exp_literal)] = {}; + std::copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (valid_in) == 11, ""); + static_assert (array_size (exp_literal) == 5, ""); + static_assert (array_size (exp) == 5, ""); + VERIFY (char_traits::length (valid_in) == 10); + VERIFY (char_traits::length (exp_literal) == 4); + VERIFY (char_traits::length (exp) == 4); + + test_offsets_error offsets[] = { + + // replace leading byte with invalid byte + {1, 4, 0, 0, '\xFF', 0}, + {3, 4, 1, 1, '\xFF', 1}, + {6, 4, 3, 2, '\xFF', 3}, + {10, 4, 6, 3, '\xFF', 6}, + + // replace first trailing byte with ASCII byte + {3, 4, 1, 1, 'z', 2}, + {6, 4, 3, 2, 'z', 4}, + {10, 4, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte + {3, 4, 1, 1, '\xFF', 2}, + {6, 4, 3, 2, '\xFF', 4}, + {10, 4, 
6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte + {6, 4, 3, 2, 'z', 5}, + {10, 4, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte + {6, 4, 3, 2, '\xFF', 5}, + {10, 4, 6, 3, '\xFF', 8}, + + // replace third trailing byte + {10, 4, 6, 3, 'z', 9}, + {10, 4, 6, 3, '\xFF', 9}, + + // replace first trailing byte with ASCII byte, also incomplete at end + {5, 4, 3, 2, 'z', 4}, + {8, 4, 6, 3, 'z', 7}, + {9, 4, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte, also incomplete at end + {5, 4, 3, 2, '\xFF', 4}, + {8, 4, 6, 3, '\xFF', 7}, + {9, 4, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte, also incomplete at end + {9, 4, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte, also incomplete at end + {9, 4, 6, 3, '\xFF', 8}, + }; + for (auto t : offsets) + { + char in[array_size (valid_in)] = {}; + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + char_traits::copy (in, valid_in, array_size (valid_in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.error); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template +void +utf8_to_utf32_in (const std::codecvt &cvt) +{ + utf8_to_utf32_in_ok (cvt); + utf8_to_utf32_in_partial (cvt); + utf8_to_utf32_in_error (cvt); +} + +template +void +utf32_to_utf8_out_ok (const std::codecvt &cvt) +{ + using namespace std; + // 
UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char32_t in_literal[] = U"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + CharT in[array_size (in_literal)] = {}; + copy (begin (in_literal), end (in_literal), begin (in)); + + static_assert (array_size (in_literal) == 5, ""); + static_assert (array_size (in) == 5, ""); + static_assert (array_size (exp) == 11, ""); + VERIFY (char_traits::length (in_literal) == 4); + VERIFY (char_traits::length (in) == 4); + VERIFY (char_traits::length (exp) == 10); + + const test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {4, 10}}; + for (auto t : offsets) + { + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } +} + +template +void +utf32_to_utf8_out_partial (const std::codecvt &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char32_t in_literal[] = U"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + CharT in[array_size (in_literal)] = {}; + copy (begin (in_literal), end (in_literal), begin (in)); + + static_assert (array_size (in_literal) == 5, ""); + static_assert (array_size (in) == 5, ""); + static_assert (array_size (exp) == 11, ""); + VERIFY (char_traits::length (in_literal) == 4); + VERIFY (char_traits::length (in) == 4); + VERIFY (char_traits::length (exp) == 10); + + const test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {2, 1, 1, 1}, 
// no space for second CP + {2, 2, 1, 1}, // no space for second CP + + {3, 3, 2, 3}, // no space for third CP + {3, 4, 2, 3}, // no space for third CP + {3, 5, 2, 3}, // no space for third CP + + {4, 6, 3, 6}, // no space for fourth CP + {4, 7, 3, 6}, // no space for fourth CP + {4, 8, 3, 6}, // no space for fourth CP + {4, 9, 3, 6}, // no space for fourth CP + }; + for (auto t : offsets) + { + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.partial); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template +void +utf32_to_utf8_out_error (const std::codecvt &cvt) +{ + using namespace std; + const char32_t valid_in[] = U"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + + static_assert (array_size (valid_in) == 5, ""); + static_assert (array_size (exp) == 11, ""); + VERIFY (char_traits::length (valid_in) == 4); + VERIFY (char_traits::length (exp) == 10); + + test_offsets_error offsets[] = {{4, 10, 0, 0, 0x00110000, 0}, + {4, 10, 1, 1, 0x00110000, 1}, + {4, 10, 2, 3, 0x00110000, 2}, + {4, 10, 3, 6, 0x00110000, 3}}; + + for (auto t : offsets) + { + CharT in[array_size (valid_in)] = {}; + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + copy (begin (valid_in), end 
(valid_in), begin (in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.error); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template +void +utf32_to_utf8_out (const std::codecvt &cvt) +{ + utf32_to_utf8_out_ok (cvt); + utf32_to_utf8_out_partial (cvt); + utf32_to_utf8_out_error (cvt); +} + +template +void +test_utf8_utf32_codecvts (const std::codecvt &cvt) +{ + utf8_to_utf32_in (cvt); + utf32_to_utf8_out (cvt); +} + +template +void +utf8_to_utf16_in_ok (const std::codecvt &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char in[] = "bш\uAAAA\U0010AAAA"; + const char16_t exp_literal[] = u"bш\uAAAA\U0010AAAA"; + CharT exp[array_size (exp_literal)] = {}; + copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (in) == 11, ""); + static_assert (array_size (exp_literal) == 6, ""); + static_assert (array_size (exp) == 6, ""); + VERIFY (char_traits::length (in) == 10); + VERIFY (char_traits::length (exp_literal) == 5); + VERIFY (char_traits::length (exp) == 5); + + test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}, {10, 5}}; + for (auto t : offsets) + { + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + 
VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } + + for (auto t : offsets) + { + CharT out[array_size (exp)] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res + = cvt.in (state, in, in + t.in_size, in_next, out, end (out), out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } +} + +template +void +utf8_to_utf16_in_partial (const std::codecvt &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char in[] = "bш\uAAAA\U0010AAAA"; + const char16_t exp_literal[] = u"bш\uAAAA\U0010AAAA"; + CharT exp[array_size (exp_literal)] = {}; + copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (in) == 11, ""); + static_assert (array_size (exp_literal) == 6, ""); + static_assert (array_size (exp) == 6, ""); + VERIFY (char_traits::length (in) == 10); + VERIFY (char_traits::length (exp_literal) == 5); + VERIFY (char_traits::length (exp) == 5); + + test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {3, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // incomplete second CP + {2, 1, 1, 1}, // incomplete second CP, and no space for it + + {6, 2, 3, 2}, // no space for third CP + {4, 3, 3, 2}, // incomplete third CP + {5, 3, 3, 2}, // incomplete third CP + {4, 2, 3, 2}, // incomplete third CP, and no space for it + {5, 2, 3, 2}, // incomplete third CP, and no space for it + + {10, 3, 6, 3}, // no 
space for fourth CP + {10, 4, 6, 3}, // no space for fourth CP + {7, 5, 6, 3}, // incomplete fourth CP + {8, 5, 6, 3}, // incomplete fourth CP + {9, 5, 6, 3}, // incomplete fourth CP + {7, 3, 6, 3}, // incomplete fourth CP, and no space for it + {8, 3, 6, 3}, // incomplete fourth CP, and no space for it + {9, 3, 6, 3}, // incomplete fourth CP, and no space for it + {7, 4, 6, 3}, // incomplete fourth CP, and no space for it + {8, 4, 6, 3}, // incomplete fourth CP, and no space for it + {9, 4, 6, 3}, // incomplete fourth CP, and no space for it + + }; + + for (auto t : offsets) + { + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.partial); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template +void +utf8_to_utf16_in_error (const std::codecvt &cvt) +{ + using namespace std; + const char valid_in[] = "bш\uAAAA\U0010AAAA"; + const char16_t exp_literal[] = u"bш\uAAAA\U0010AAAA"; + CharT exp[array_size (exp_literal)] = {}; + copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (valid_in) == 11, ""); + static_assert (array_size (exp_literal) == 6, ""); + static_assert (array_size (exp) == 6, ""); + VERIFY (char_traits::length (valid_in) == 10); + VERIFY (char_traits::length (exp_literal) == 5); + VERIFY (char_traits::length (exp) == 5); + + test_offsets_error offsets[] = { + + // replace leading byte with 
invalid byte + {1, 5, 0, 0, '\xFF', 0}, + {3, 5, 1, 1, '\xFF', 1}, + {6, 5, 3, 2, '\xFF', 3}, + {10, 5, 6, 3, '\xFF', 6}, + + // replace first trailing byte with ASCII byte + {3, 5, 1, 1, 'z', 2}, + {6, 5, 3, 2, 'z', 4}, + {10, 5, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte + {3, 5, 1, 1, '\xFF', 2}, + {6, 5, 3, 2, '\xFF', 4}, + {10, 5, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte + {6, 5, 3, 2, 'z', 5}, + {10, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte + {6, 5, 3, 2, '\xFF', 5}, + {10, 5, 6, 3, '\xFF', 8}, + + // replace third trailing byte + {10, 5, 6, 3, 'z', 9}, + {10, 5, 6, 3, '\xFF', 9}, + + // replace first trailing byte with ASCII byte, also incomplete at end + {5, 5, 3, 2, 'z', 4}, + {8, 5, 6, 3, 'z', 7}, + {9, 5, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte, also incomplete at end + {5, 5, 3, 2, '\xFF', 4}, + {8, 5, 6, 3, '\xFF', 7}, + {9, 5, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte, also incomplete at end + {9, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte, also incomplete at end + {9, 5, 6, 3, '\xFF', 8}, + }; + for (auto t : offsets) + { + char in[array_size (valid_in)] = {}; + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + char_traits::copy (in, valid_in, array_size (valid_in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.error); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits::compare (out, exp, t.expected_out_next) 
== 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template +void +utf8_to_utf16_in (const std::codecvt &cvt) +{ + utf8_to_utf16_in_ok (cvt); + utf8_to_utf16_in_partial (cvt); + utf8_to_utf16_in_error (cvt); +} + +template +void +utf16_to_utf8_out_ok (const std::codecvt &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char16_t in_literal[] = u"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + CharT in[array_size (in_literal)]; + copy (begin (in_literal), end (in_literal), begin (in)); + + static_assert (array_size (in_literal) == 6, ""); + static_assert (array_size (exp) == 11, ""); + static_assert (array_size (in) == 6, ""); + VERIFY (char_traits::length (in_literal) == 5); + VERIFY (char_traits::length (exp) == 10); + VERIFY (char_traits::length (in) == 5); + + const test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}, {5, 10}}; + for (auto t : offsets) + { + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } +} + +template +void +utf16_to_utf8_out_partial (const std::codecvt &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP, 3-byte CP and 4-byte CP + const char16_t in_literal[] = u"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + CharT in[array_size (in_literal)]; + copy (begin (in_literal), end (in_literal), begin (in)); + + static_assert (array_size 
(in_literal) == 6, ""); + static_assert (array_size (exp) == 11, ""); + static_assert (array_size (in) == 6, ""); + VERIFY (char_traits::length (in_literal) == 5); + VERIFY (char_traits::length (exp) == 10); + VERIFY (char_traits::length (in) == 5); + + const test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {2, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // no space for second CP + + {3, 3, 2, 3}, // no space for third CP + {3, 4, 2, 3}, // no space for third CP + {3, 5, 2, 3}, // no space for third CP + + {5, 6, 3, 6}, // no space for fourth CP + {5, 7, 3, 6}, // no space for fourth CP + {5, 8, 3, 6}, // no space for fourth CP + {5, 9, 3, 6}, // no space for fourth CP + + {4, 10, 3, 6}, // incomplete fourth CP + + {4, 6, 3, 6}, // incomplete fourth CP, and no space for it + {4, 7, 3, 6}, // incomplete fourth CP, and no space for it + {4, 8, 3, 6}, // incomplete fourth CP, and no space for it + {4, 9, 3, 6}, // incomplete fourth CP, and no space for it + }; + for (auto t : offsets) + { + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.partial); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template +void +utf16_to_utf8_out_error (const std::codecvt &cvt) +{ + using namespace std; + const char16_t valid_in[] = u"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + + static_assert 
(array_size (valid_in) == 6, ""); + static_assert (array_size (exp) == 11, ""); + VERIFY (char_traits::length (valid_in) == 5); + VERIFY (char_traits::length (exp) == 10); + + test_offsets_error offsets[] = { + {5, 10, 0, 0, 0xD800, 0}, + {5, 10, 0, 0, 0xDBFF, 0}, + {5, 10, 0, 0, 0xDC00, 0}, + {5, 10, 0, 0, 0xDFFF, 0}, + + {5, 10, 1, 1, 0xD800, 1}, + {5, 10, 1, 1, 0xDBFF, 1}, + {5, 10, 1, 1, 0xDC00, 1}, + {5, 10, 1, 1, 0xDFFF, 1}, + + {5, 10, 2, 3, 0xD800, 2}, + {5, 10, 2, 3, 0xDBFF, 2}, + {5, 10, 2, 3, 0xDC00, 2}, + {5, 10, 2, 3, 0xDFFF, 2}, + + // make the leading surrogate a trailing one + {5, 10, 3, 6, 0xDC00, 3}, + {5, 10, 3, 6, 0xDFFF, 3}, + + // make the trailing surrogate a leading one + {5, 10, 3, 6, 0xD800, 4}, + {5, 10, 3, 6, 0xDBFF, 4}, + + // make the trailing surrogate a BMP char + {5, 10, 3, 6, u'z', 4}, + }; + + for (auto t : offsets) + { + CharT in[array_size (valid_in)] = {}; + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + copy (begin (valid_in), end (valid_in), begin (in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.error); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template +void +utf16_to_utf8_out (const std::codecvt &cvt) +{ + utf16_to_utf8_out_ok (cvt); + utf16_to_utf8_out_partial (cvt); + utf16_to_utf8_out_error (cvt); +} + +template +void +test_utf8_utf16_cvts (const std::codecvt &cvt) +{ + utf8_to_utf16_in 
(cvt); + utf16_to_utf8_out (cvt); +} + +template +void +utf8_to_ucs2_in_ok (const std::codecvt &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP + const char in[] = "bш\uAAAA"; + const char16_t exp_literal[] = u"bш\uAAAA"; + CharT exp[array_size (exp_literal)] = {}; + copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (in) == 7, ""); + static_assert (array_size (exp_literal) == 4, ""); + static_assert (array_size (exp) == 4, ""); + VERIFY (char_traits::length (in) == 6); + VERIFY (char_traits::length (exp_literal) == 3); + VERIFY (char_traits::length (exp) == 3); + + test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {3, 2}, {6, 3}}; + for (auto t : offsets) + { + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } + + for (auto t : offsets) + { + CharT out[array_size (exp)] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res + = cvt.in (state, in, in + t.in_size, in_next, out, end (out), out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } +} + +template +void 
+utf8_to_ucs2_in_partial (const std::codecvt &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP + const char in[] = "bш\uAAAA"; + const char16_t exp_literal[] = u"bш\uAAAA"; + CharT exp[array_size (exp_literal)] = {}; + copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (in) == 7, ""); + static_assert (array_size (exp_literal) == 4, ""); + static_assert (array_size (exp) == 4, ""); + VERIFY (char_traits::length (in) == 6); + VERIFY (char_traits::length (exp_literal) == 3); + VERIFY (char_traits::length (exp) == 3); + + test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {3, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // incomplete second CP + {2, 1, 1, 1}, // incomplete second CP, and no space for it + + {6, 2, 3, 2}, // no space for third CP + {4, 3, 3, 2}, // incomplete third CP + {5, 3, 3, 2}, // incomplete third CP + {4, 2, 3, 2}, // incomplete third CP, and no space for it + {5, 2, 3, 2}, // incomplete third CP, and no space for it + }; + + for (auto t : offsets) + { + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.partial); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template +void +utf8_to_ucs2_in_error (const std::codecvt &cvt) +{ + using namespace std; + const char valid_in[] = "bш\uAAAA\U0010AAAA"; + const char16_t 
exp_literal[] = u"bш\uAAAA\U0010AAAA"; + CharT exp[array_size (exp_literal)] = {}; + copy (begin (exp_literal), end (exp_literal), begin (exp)); + + static_assert (array_size (valid_in) == 11, ""); + static_assert (array_size (exp_literal) == 6, ""); + static_assert (array_size (exp) == 6, ""); + VERIFY (char_traits::length (valid_in) == 10); + VERIFY (char_traits::length (exp_literal) == 5); + VERIFY (char_traits::length (exp) == 5); + + test_offsets_error offsets[] = { + + // replace leading byte with invalid byte + {1, 5, 0, 0, '\xFF', 0}, + {3, 5, 1, 1, '\xFF', 1}, + {6, 5, 3, 2, '\xFF', 3}, + {10, 5, 6, 3, '\xFF', 6}, + + // replace first trailing byte with ASCII byte + {3, 5, 1, 1, 'z', 2}, + {6, 5, 3, 2, 'z', 4}, + {10, 5, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte + {3, 5, 1, 1, '\xFF', 2}, + {6, 5, 3, 2, '\xFF', 4}, + {10, 5, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte + {6, 5, 3, 2, 'z', 5}, + {10, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte + {6, 5, 3, 2, '\xFF', 5}, + {10, 5, 6, 3, '\xFF', 8}, + + // replace third trailing byte + {10, 5, 6, 3, 'z', 9}, + {10, 5, 6, 3, '\xFF', 9}, + + // When we see a leading byte of 4-byte CP, we should return error, no + // matter if it is incomplete at the end or has errors in the trailing + // bytes. 
+ + // Don't replace anything, show full 4-byte CP + {10, 4, 6, 3, 'b', 0}, + {10, 5, 6, 3, 'b', 0}, + + // Don't replace anything, show incomplete 4-byte CP at the end + {7, 4, 6, 3, 'b', 0}, // incomplete fourth CP + {8, 4, 6, 3, 'b', 0}, // incomplete fourth CP + {9, 4, 6, 3, 'b', 0}, // incomplete fourth CP + {7, 5, 6, 3, 'b', 0}, // incomplete fourth CP + {8, 5, 6, 3, 'b', 0}, // incomplete fourth CP + {9, 5, 6, 3, 'b', 0}, // incomplete fourth CP + + // replace first trailing byte with ASCII byte, also incomplete at end + {5, 5, 3, 2, 'z', 4}, + + // replace first trailing byte with invalid byte, also incomplete at end + {5, 5, 3, 2, '\xFF', 4}, + + // replace first trailing byte with ASCII byte, also incomplete at end + {8, 5, 6, 3, 'z', 7}, + {9, 5, 6, 3, 'z', 7}, + + // replace first trailing byte with invalid byte, also incomplete at end + {8, 5, 6, 3, '\xFF', 7}, + {9, 5, 6, 3, '\xFF', 7}, + + // replace second trailing byte with ASCII byte, also incomplete at end + {9, 5, 6, 3, 'z', 8}, + + // replace second trailing byte with invalid byte, also incomplete at end + {9, 5, 6, 3, '\xFF', 8}, + }; + for (auto t : offsets) + { + char in[array_size (valid_in)] = {}; + CharT out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + char_traits::copy (in, valid_in, array_size (valid_in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const char *) nullptr; + auto out_next = (CharT *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.in (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.error); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY 
(out[t.expected_out_next] == 0); + } +} + +template +void +utf8_to_ucs2_in (const std::codecvt &cvt) +{ + utf8_to_ucs2_in_ok (cvt); + utf8_to_ucs2_in_partial (cvt); + utf8_to_ucs2_in_error (cvt); +} + +template +void +ucs2_to_utf8_out_ok (const std::codecvt &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP + const char16_t in_literal[] = u"bш\uAAAA"; + const char exp[] = "bш\uAAAA"; + CharT in[array_size (in_literal)] = {}; + copy (begin (in_literal), end (in_literal), begin (in)); + + static_assert (array_size (in_literal) == 4, ""); + static_assert (array_size (exp) == 7, ""); + static_assert (array_size (in) == 4, ""); + VERIFY (char_traits::length (in_literal) == 3); + VERIFY (char_traits::length (exp) == 6); + VERIFY (char_traits::length (in) == 3); + + const test_offsets_ok offsets[] = {{0, 0}, {1, 1}, {2, 3}, {3, 6}}; + for (auto t : offsets) + { + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.ok); + VERIFY (in_next == in + t.in_size); + VERIFY (out_next == out + t.out_size); + VERIFY (char_traits::compare (out, exp, t.out_size) == 0); + if (t.out_size < array_size (out)) + VERIFY (out[t.out_size] == 0); + } +} + +template +void +ucs2_to_utf8_out_partial (const std::codecvt &cvt) +{ + using namespace std; + // UTF-8 string of 1-byte CP, 2-byte CP and 3-byte CP + const char16_t in_literal[] = u"bш\uAAAA"; + const char exp[] = "bш\uAAAA"; + CharT in[array_size (in_literal)] = {}; + copy (begin (in_literal), end (in_literal), begin (in)); + + static_assert (array_size (in_literal) == 4, ""); + static_assert (array_size (exp) == 7, ""); + static_assert (array_size (in) == 4, ""); + VERIFY 
(char_traits::length (in_literal) == 3); + VERIFY (char_traits::length (exp) == 6); + VERIFY (char_traits::length (in) == 3); + + const test_offsets_partial offsets[] = { + {1, 0, 0, 0}, // no space for first CP + + {2, 1, 1, 1}, // no space for second CP + {2, 2, 1, 1}, // no space for second CP + + {3, 3, 2, 3}, // no space for third CP + {3, 4, 2, 3}, // no space for third CP + {3, 5, 2, 3}, // no space for third CP + }; + for (auto t : offsets) + { + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.partial); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template +void +ucs2_to_utf8_out_error (const std::codecvt &cvt) +{ + using namespace std; + const char16_t valid_in[] = u"bш\uAAAA\U0010AAAA"; + const char exp[] = "bш\uAAAA\U0010AAAA"; + + static_assert (array_size (valid_in) == 6, ""); + static_assert (array_size (exp) == 11, ""); + VERIFY (char_traits::length (valid_in) == 5); + VERIFY (char_traits::length (exp) == 10); + + test_offsets_error offsets[] = { + {5, 10, 0, 0, 0xD800, 0}, + {5, 10, 0, 0, 0xDBFF, 0}, + {5, 10, 0, 0, 0xDC00, 0}, + {5, 10, 0, 0, 0xDFFF, 0}, + + {5, 10, 1, 1, 0xD800, 1}, + {5, 10, 1, 1, 0xDBFF, 1}, + {5, 10, 1, 1, 0xDC00, 1}, + {5, 10, 1, 1, 0xDFFF, 1}, + + {5, 10, 2, 3, 0xD800, 2}, + {5, 10, 2, 3, 0xDBFF, 2}, + {5, 10, 2, 3, 0xDC00, 2}, + {5, 10, 2, 3, 0xDFFF, 2}, + + // dont replace anything, just show the 
surrogate pair + {5, 10, 3, 6, u'b', 0}, + + // make the leading surrogate a trailing one + {5, 10, 3, 6, 0xDC00, 3}, + {5, 10, 3, 6, 0xDFFF, 3}, + + // make the trailing surrogate a leading one + {5, 10, 3, 6, 0xD800, 4}, + {5, 10, 3, 6, 0xDBFF, 4}, + + // make the trailing surrogate a BMP char + {5, 10, 3, 6, u'z', 4}, + + {5, 7, 3, 6, u'b', 0}, // no space for fourth CP + {5, 8, 3, 6, u'b', 0}, // no space for fourth CP + {5, 9, 3, 6, u'b', 0}, // no space for fourth CP + + {4, 10, 3, 6, u'b', 0}, // incomplete fourth CP + {4, 7, 3, 6, u'b', 0}, // incomplete fourth CP, and no space for it + {4, 8, 3, 6, u'b', 0}, // incomplete fourth CP, and no space for it + {4, 9, 3, 6, u'b', 0}, // incomplete fourth CP, and no space for it + + }; + + for (auto t : offsets) + { + CharT in[array_size (valid_in)] = {}; + char out[array_size (exp) - 1] = {}; + VERIFY (t.in_size <= array_size (in)); + VERIFY (t.out_size <= array_size (out)); + VERIFY (t.expected_in_next <= t.in_size); + VERIFY (t.expected_out_next <= t.out_size); + copy (begin (valid_in), end (valid_in), begin (in)); + in[t.replace_pos] = t.replace_char; + + auto state = mbstate_t{}; + auto in_next = (const CharT *) nullptr; + auto out_next = (char *) nullptr; + auto res = codecvt_base::result (); + + res = cvt.out (state, in, in + t.in_size, in_next, out, out + t.out_size, + out_next); + VERIFY (res == cvt.error); + VERIFY (in_next == in + t.expected_in_next); + VERIFY (out_next == out + t.expected_out_next); + VERIFY (char_traits::compare (out, exp, t.expected_out_next) == 0); + if (t.expected_out_next < array_size (out)) + VERIFY (out[t.expected_out_next] == 0); + } +} + +template +void +ucs2_to_utf8_out (const std::codecvt &cvt) +{ + ucs2_to_utf8_out_ok (cvt); + ucs2_to_utf8_out_partial (cvt); + ucs2_to_utf8_out_error (cvt); +} + +template +void +test_utf8_ucs2_cvts (const std::codecvt &cvt) +{ + utf8_to_ucs2_in (cvt); + ucs2_to_utf8_out (cvt); +} diff --git 
a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode_wchar_t.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode_wchar_t.cc
new file mode 100644
index 00000000000..169504939a2
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_unicode_wchar_t.cc
@@ -0,0 +1,59 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library. This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3. If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include "codecvt_unicode.h"
+
+#include <codecvt>
+
+using namespace std;
+
+void
+test_utf8_utf32_codecvts ()
+{
+#if __SIZEOF_WCHAR_T__ == 4
+  auto cvt_ptr = to_unique_ptr (new codecvt_utf8<wchar_t> ());
+  test_utf8_utf32_codecvts (*cvt_ptr);
+#endif
+}
+
+void
+test_utf8_utf16_codecvts ()
+{
+#if __SIZEOF_WCHAR_T__ >= 2
+  auto cvt_ptr = to_unique_ptr (new codecvt_utf8_utf16<wchar_t> ());
+  test_utf8_utf16_cvts (*cvt_ptr);
+#endif
+}
+
+void
+test_utf8_ucs2_codecvts ()
+{
+#if __SIZEOF_WCHAR_T__ == 2
+  auto cvt_ptr = to_unique_ptr (new codecvt_utf8<wchar_t> ());
+  test_utf8_ucs2_cvts (*cvt_ptr);
+#endif
+}
+
+int
+main ()
+{
+  test_utf8_utf32_codecvts ();
+  test_utf8_utf16_codecvts ();
+  test_utf8_ucs2_codecvts ();
+}