commit c5bbc9258a7182e14eb731e5251842bc417b5822 Author: Jonathan Wakely Date: Fri Mar 10 20:12:09 2017 +0000 PR libstdc++/79511 fix endianness of UTF-16 data PR libstdc++/79511 * src/c++11/codecvt.cc (write_utf16_code_point): Don't write 0xffff as a surrogate pair. (__codecvt_utf8_utf16_base<char32_t>::do_in): Use native endianness for internal representation. (__codecvt_utf8_utf16_base<wchar_t>::do_in): Likewise. * testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc: New test. diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc index 12a4d4f..9b63e2b 100644 --- a/libstdc++-v3/src/c++11/codecvt.cc +++ b/libstdc++-v3/src/c++11/codecvt.cc @@ -315,7 +315,7 @@ namespace { static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit"); - if (codepoint < max_single_utf16_unit) + if (codepoint <= max_single_utf16_unit) { if (to.size() > 0) { @@ -1341,7 +1341,11 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, { range<const char> from{ __from, __from_end }; range<char32_t> to{ __to, __to_end }; - auto res = utf16_in(from, to, _M_maxcode, _M_mode); + codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); +#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ + mode = codecvt_mode(mode | little_endian); +#endif + auto res = utf16_in(from, to, _M_maxcode, mode); __from_next = from.next; __to_next = to.next; return res; @@ -1411,7 +1415,11 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end, { range<const char> from{ __from, __from_end }; range<wchar_t> to{ __to, __to_end }; - auto res = utf16_in(from, to, _M_maxcode, _M_mode); + codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header)); +#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ + mode = codecvt_mode(mode | little_endian); +#endif + auto res = utf16_in(from, to, _M_maxcode, mode); __from_next = from.next; __to_next = to.next; return res; diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc 
b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc new file mode 100644 index 0000000..5555bcb --- /dev/null +++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc @@ -0,0 +1,60 @@ +// Copyright (C) 2017 Free Software Foundation, Inc. +// +// This file is part of the GNU ISO C++ Library. This library is free +// software; you can redistribute it and/or modify it under the +// terms of the GNU General Public License as published by the +// Free Software Foundation; either version 3, or (at your option) +// any later version. + +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License along +// with this library; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + +// { dg-do run { target c++11 } } + +#include <locale> +#include <codecvt> +#include <testsuite_hooks.h> + +// PR libstdc++/79511 + +template<typename ElemT> + std::basic_string<ElemT> conv(const char* src) + { + std::wstring_convert<std::codecvt_utf8_utf16<ElemT>, ElemT> conv; + return conv.from_bytes(src); + } + +void +test01() +{ + static char const src[] = "\xEF\xBF\xBF"; + VERIFY( conv<char16_t>(src) == u"\xffff" ); + VERIFY( conv<char32_t>(src) == U"\xffff" ); +#ifdef _GLIBCXX_USE_WCHAR_T + VERIFY( conv<wchar_t>(src) == L"\xffff" ); +#endif +} + +void +test02() +{ + static char const src[] = "\xE2\x82\xAC"; + VERIFY( conv<char16_t>(src) == u"\x20ac" ); + VERIFY( conv<char32_t>(src) == U"\x20ac" ); +#ifdef _GLIBCXX_USE_WCHAR_T + VERIFY( conv<wchar_t>(src) == L"\x20ac" ); +#endif +} + +int +main() +{ + test01(); + test02(); +}