public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] Various fixes for <codecvt> facets
@ 2017-03-13 19:36 Jonathan Wakely
  2017-03-14 18:46 ` Jonathan Wakely
  0 siblings, 1 reply; 5+ messages in thread
From: Jonathan Wakely @ 2017-03-13 19:36 UTC (permalink / raw)
  To: libstdc++, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1172 bytes --]

This is a series of patches to fix various bugs in the Unicode
character conversion facets.

The first patch fixes a silly < versus <= bug that meant that 0xffff
got written as a surrogate pair instead of as simply 0xffff, and an
endianness bug for the internal representation of UTF-16 code units
stored in char32_t or wchar_t values. That's PR 79511.

The second patch fixes some incorrect bitwise operations (because I
confused & and |) and some incorrect limits (because I confused max
and min). That fixes determining the endianness of the external
representation bytes when they start with a Byte Order Mark, and
correctly reports errors on invalid UCS2. It also fixes
wstring_convert so that it reports the number of characters that were
converted prior to an error. That's PR 79980.

The third patch fixes the output of the encoding() and max_length()
member functions on the codecvt facets, because I wasn't correctly
accounting for a BOM or for the differences between UTF-16 and UCS2.

I plan to commit these for all branches, but I'll wait until after GCC
7.1 is released, and fix it for 7.2 instead. These bugs aren't
important enough to rush into trunk now.



[-- Attachment #2: 79511.patch --]
[-- Type: text/plain, Size: 4020 bytes --]

commit c5bbc9258a7182e14eb731e5251842bc417b5822
Author: Jonathan Wakely <jwakely@redhat.com>
Date:   Fri Mar 10 20:12:09 2017 +0000

    PR libstdc++/79511 fix endianness of UTF-16 data
    
    	PR libstdc++/79511
    	* src/c++11/codecvt.cc (write_utf16_code_point): Don't write 0xffff
    	as a surrogate pair.
    	(__codecvt_utf8_utf16_base<char32_t>::do_in): Use native endianness
    	for internal representation.
    	(__codecvt_utf8_utf16_base<wchar_t>::do_in): Likewise.
    	* testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc: New test.

diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index 12a4d4f..9b63e2b 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -315,7 +315,7 @@ namespace
   {
     static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
 
-    if (codepoint < max_single_utf16_unit)
+    if (codepoint <= max_single_utf16_unit)
       {
 	if (to.size() > 0)
 	  {
@@ -1341,7 +1341,11 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 {
   range<const char> from{ __from, __from_end };
   range<char32_t> to{ __to, __to_end };
-  auto res = utf16_in(from, to, _M_maxcode, _M_mode);
+  codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
+#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
+  mode = codecvt_mode(mode | little_endian);
+#endif
+  auto res = utf16_in(from, to, _M_maxcode, mode);
   __from_next = from.next;
   __to_next = to.next;
   return res;
@@ -1411,7 +1415,11 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 {
   range<const char> from{ __from, __from_end };
   range<wchar_t> to{ __to, __to_end };
-  auto res = utf16_in(from, to, _M_maxcode, _M_mode);
+  codecvt_mode mode = codecvt_mode(_M_mode & (consume_header|generate_header));
+#if __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__
+  mode = codecvt_mode(mode | little_endian);
+#endif
+  auto res = utf16_in(from, to, _M_maxcode, mode);
   __from_next = from.next;
   __to_next = to.next;
   return res;
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc
new file mode 100644
index 0000000..5555bcb
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/79511.cc
@@ -0,0 +1,60 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <locale>
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+// PR libstdc++/79511
+
+template<typename ElemT>
+  std::basic_string<ElemT> conv(const char* src)
+  {
+    std::wstring_convert<std::codecvt_utf8_utf16<ElemT>, ElemT> conv;
+    return conv.from_bytes(src);
+  }
+
+void
+test01()
+{
+  static char const src[] = "\xEF\xBF\xBF";
+  VERIFY( conv<char16_t>(src) == u"\xffff" );
+  VERIFY( conv<char32_t>(src) == U"\xffff" );
+#ifdef _GLIBCXX_USE_WCHAR_T
+  VERIFY( conv<wchar_t>(src) == L"\xffff" );
+#endif
+}
+
+void
+test02()
+{
+  static char const src[] = "\xE2\x82\xAC";
+  VERIFY( conv<char16_t>(src) == u"\x20ac" );
+  VERIFY( conv<char32_t>(src) == U"\x20ac" );
+#ifdef _GLIBCXX_USE_WCHAR_T
+  VERIFY( conv<wchar_t>(src) == L"\x20ac" );
+#endif
+}
+
+int
+main()
+{
+  test01();
+  test02();
+}


[-- Attachment #3: 79980.patch --]
[-- Type: text/plain, Size: 16206 bytes --]

commit dc9f4c953aa1600978877a90763122f0104e6c4c
Author: Jonathan Wakely <jwakely@redhat.com>
Date:   Sat Mar 11 03:39:30 2017 +0000

    PR libstdc++/79980 fix BOM detection, maxcode checks, UCS2 handling
    
    	PR libstdc++/79980
    	* include/bits/locale_conv.h (__do_str_codecvt): Set __count on
    	error path.
    	* src/c++11/codecvt.cc (operator&=, operator|=, operator~): Overloads
    	for manipulating codecvt_mode values.
    	(read_utf16_bom): Compare input to BOM constants instead of integral
    	constants that depend on endianness.  Take mode parameter by
    	reference and adjust it, to distinguish between no BOM present and
    	UTF-16BE BOM present.
    	(ucs4_in, ucs2_span, ucs4_span): Adjust calls to read_utf16_bom.
    	(surrogates): New enumeration type.
    	(utf16_in, utf16_out): Add surrogates parameter to choose between
    	UTF-16 and UCS2 behaviour.
    	(utf16_span, ucs2_span): Use std::min not std::max.
    	(ucs2_out): Use std::min not std::max.  Disallow surrogate pairs.
    	(ucs2_in): Likewise. Adjust calls to read_utf16_bom.
    	* testsuite/22_locale/codecvt/codecvt_utf16/79980.cc: New test.
    	* testsuite/22_locale/codecvt/codecvt_utf8/79980.cc: New test.

diff --git a/libstdc++-v3/include/bits/locale_conv.h b/libstdc++-v3/include/bits/locale_conv.h
index cd8f146..9b952d4 100644
--- a/libstdc++-v3/include/bits/locale_conv.h
+++ b/libstdc++-v3/include/bits/locale_conv.h
@@ -81,7 +81,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	     && (__outstr.size() - __outchars) < __maxlen);
 
       if (__result == codecvt_base::error)
-	return false;
+	{
+	  __count = __next - __first;
+	  return false;
+	}
 
       if (__result == codecvt_base::noconv)
 	{
diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index 9b63e2b..a50804c 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -24,13 +24,27 @@
 
 #include <codecvt>
 #include <cstring>		// std::memcpy, std::memcmp
-#include <bits/stl_algobase.h>	// std::max
+#include <bits/stl_algobase.h>	// std::min
 
 #ifdef _GLIBCXX_USE_C99_STDINT_TR1
 namespace std _GLIBCXX_VISIBILITY(default)
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
+  // The standard doesn't define these operators, which is annoying.
+  static underlying_type<codecvt_mode>::type
+  to_integer(codecvt_mode m)
+  { return static_cast<mode_t>(m); }
+
+  static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
+  { return m = codecvt_mode(to_integer(m) & to_integer(n)); }
+
+  static codecvt_mode& operator|=(codecvt_mode& m, codecvt_mode n)
+  { return m = codecvt_mode(to_integer(m) | to_integer(n)); }
+
+  static codecvt_mode operator~(codecvt_mode m)
+  { return codecvt_mode(~to_integer(m)); }
+
 namespace
 {
   // Largest code point that fits in a single UTF-16 code unit.
@@ -117,22 +131,26 @@ namespace
       read_bom(from, utf8_bom);
   }
 
-  // If consume_header is set in mode update from.next to after any BOM.
-  // Return little_endian iff the UTF-16LE BOM was present.
-  codecvt_mode
-  read_utf16_bom(range<const char16_t>& from, codecvt_mode mode)
+  // If consume_header is not set in mode, no effects.
+  // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
+  // - if the UTF-16BE BOM was found unset little_endian in mode, or
+  // - if the UTF-16LE BOM was found set little_endian in mode.
+  void
+  read_utf16_bom(range<const char16_t>& from, codecvt_mode& mode)
   {
     if (mode & consume_header && from.size())
       {
-	if (*from.next == 0xFEFF)
-	  ++from.next;
-	else if (*from.next == 0xFFFE)
+	if (!memcmp(from.next, utf16_bom, 2))
 	  {
 	    ++from.next;
-	    return little_endian;
+	    mode &= ~little_endian;
+	  }
+	else if (!memcmp(from.next, utf16le_bom, 2))
+	  {
+	    ++from.next;
+	    mode |= little_endian;
 	  }
       }
-    return {};
   }
 
   // Read a codepoint from a UTF-8 multibyte sequence.
@@ -380,8 +398,7 @@ namespace
   ucs4_in(range<const char16_t>& from, range<char32_t>& to,
           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
   {
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
+    read_utf16_bom(from, mode);
     while (from.size() && to.size())
       {
 	const char32_t codepoint = read_utf16_code_point(from, maxcode, mode);
@@ -413,11 +430,15 @@ namespace
     return codecvt_base::ok;
   }
 
-  // utf8 -> utf16
+  // Flag indicating whether to process UTF-16 or UCS2
+  enum class surrogates { allowed, disallowed };
+
+  // utf8 -> utf16 (or utf8 -> ucs2 if s == surrogates::disallowed)
   template<typename C>
   codecvt_base::result
   utf16_in(range<const char>& from, range<C>& to,
-           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
+	   unsigned long maxcode = max_code_point, codecvt_mode mode = {},
+	   surrogates s = surrogates::allowed)
   {
     read_utf8_bom(from, mode);
     while (from.size() && to.size())
@@ -425,7 +446,12 @@ namespace
 	const char* const first = from.next;
 	const char32_t codepoint = read_utf8_code_point(from, maxcode);
 	if (codepoint == incomplete_mb_character)
-	  return codecvt_base::partial;
+	  {
+	    if (s == surrogates::allowed)
+	      return codecvt_base::partial;
+	    else
+	      return codecvt_base::error; // No surrogates in UCS2
+	  }
 	if (codepoint > maxcode)
 	  return codecvt_base::error;
 	if (!write_utf16_code_point(to, codepoint, mode))
@@ -437,11 +463,12 @@ namespace
     return codecvt_base::ok;
   }
 
-  // utf16 -> utf8
+  // utf16 -> utf8 (or ucs2 -> utf8 if s == surrogates::disallowed)
   template<typename C>
   codecvt_base::result
   utf16_out(range<const C>& from, range<char>& to,
-            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
+	    unsigned long maxcode = max_code_point, codecvt_mode mode = {},
+	    surrogates s = surrogates::allowed)
   {
     if (!write_utf8_bom(to, mode))
       return codecvt_base::partial;
@@ -451,6 +478,9 @@ namespace
 	int inc = 1;
 	if (is_high_surrogate(c))
 	  {
+	    if (s == surrogates::disallowed)
+	      return codecvt_base::error; // No surrogates in UCS-2
+
 	    if (from.size() < 2)
 	      return codecvt_base::ok; // stop converting at this point
 
@@ -492,7 +522,7 @@ namespace
 	++count;
       }
     if (count+1 == max) // take one more character if it fits in a single unit
-      read_utf8_code_point(from, std::max(max_single_utf16_unit, maxcode));
+      read_utf8_code_point(from, std::min(max_single_utf16_unit, maxcode));
     return from.next;
   }
 
@@ -501,7 +531,9 @@ namespace
   ucs2_in(range<const char>& from, range<char16_t>& to,
 	  char32_t maxcode = max_code_point, codecvt_mode mode = {})
   {
-    return utf16_in(from, to, std::max(max_single_utf16_unit, maxcode), mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
+    return utf16_in(from, to, maxcode, mode, surrogates::disallowed);
   }
 
   // ucs2 -> utf8
@@ -509,7 +541,9 @@ namespace
   ucs2_out(range<const char16_t>& from, range<char>& to,
 	   char32_t maxcode = max_code_point, codecvt_mode mode = {})
   {
-    return utf16_out(from, to, std::max(max_single_utf16_unit, maxcode), mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
+    return utf16_out(from, to, maxcode, mode, surrogates::disallowed);
   }
 
   // ucs2 -> utf16
@@ -537,14 +571,14 @@ namespace
   ucs2_in(range<const char16_t>& from, range<char16_t>& to,
 	  char32_t maxcode = max_code_point, codecvt_mode mode = {})
   {
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
-    maxcode = std::max(max_single_utf16_unit, maxcode);
+    read_utf16_bom(from, mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
     while (from.size() && to.size())
       {
 	const char32_t c = read_utf16_code_point(from, maxcode, mode);
 	if (c == incomplete_mb_character)
-	  return codecvt_base::partial;
+	  return codecvt_base::error; // UCS-2 only supports single units.
 	if (c > maxcode)
 	  return codecvt_base::error;
 	*to.next++ = c;
@@ -557,9 +591,9 @@ namespace
             char32_t maxcode, codecvt_mode mode)
   {
     range<const char16_t> from{ begin, end };
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
-    maxcode = std::max(max_single_utf16_unit, maxcode);
+    read_utf16_bom(from, mode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
     char32_t c = 0;
     while (max-- && c <= maxcode)
       c = read_utf16_code_point(from, maxcode, mode);
@@ -572,7 +606,8 @@ namespace
   {
     range<const char> from{ begin, end };
     read_utf8_bom(from, mode);
-    maxcode = std::max(max_single_utf16_unit, maxcode);
+    // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
+    maxcode = std::min(max_single_utf16_unit, maxcode);
     char32_t c = 0;
     while (max-- && c <= maxcode)
       c = read_utf8_code_point(from, maxcode);
@@ -598,8 +633,7 @@ namespace
             char32_t maxcode = max_code_point, codecvt_mode mode = {})
   {
     range<const char16_t> from{ begin, end };
-    if (read_utf16_bom(from, mode) == little_endian)
-      mode = codecvt_mode(mode & little_endian);
+    read_utf16_bom(from, mode);
     char32_t c = 0;
     while (max-- && c <= maxcode)
       c = read_utf16_code_point(from, maxcode, mode);
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc
new file mode 100644
index 0000000..9383818
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc
@@ -0,0 +1,115 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <locale>
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+// PR libstdc++/79980
+
+constexpr std::codecvt_mode mode(std::codecvt_mode m)
+{ return static_cast<std::codecvt_mode>(m | std::consume_header); }
+
+template<typename WCh, unsigned long Max = 0x10FFFF,
+	 std::codecvt_mode Mode = std::consume_header>
+  using Conv
+    = std::wstring_convert<std::codecvt_utf16<WCh, Max, mode(Mode)>, WCh>;
+
+void
+test01()
+{
+  const char src[] = "\xFE\xFF\xAB\xCD";
+  Conv<char16_t> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xabcd );
+}
+
+void
+test02()
+{
+  const char src[] = "\xFF\xFE\xAB\xCD";
+  Conv<char16_t> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xcdab );
+}
+
+void
+test03()
+{
+  const char src[] = "\xFE\xFF\xAB\xCD";
+  Conv<char16_t, 0x10FFFF, std::little_endian> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xabcd );
+}
+
+void
+test04()
+{
+  const char src[] = "\xFF\xFE\xAB\xCD";
+  Conv<char16_t, 0x10FFFF, std::little_endian> conv;
+  auto dst = conv.from_bytes(src, src+4);
+  VERIFY( dst[0] == 0xcdab );
+}
+
+void
+test05()
+{
+  const char src[] = "\0\x61\xAB\xCD"; // character greater than 0x00FF
+  Conv<char16_t, 0xFF> conv("to_bytes failed", u"from_bytes failed");
+  std::u16string result = conv.from_bytes(src, src+4);
+  VERIFY( result == u"from_bytes failed" );
+  VERIFY( conv.converted() == 2 );
+}
+
+void
+test06()
+{
+  const char src[] = "\0\x61\xAB\xCD";
+  Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
+  std::u16string result = conv.from_bytes(src, src+3); // incomplete character
+  VERIFY( result == u"from_bytes failed" );
+  VERIFY( conv.converted() == 2 );
+}
+
+void
+test07()
+{
+  Conv<char16_t> conv("to_bytes failed", u"from_bytes failed");
+  // ucs2 to utf-16 conversion should fail on invalid ucs2 input:
+  std::u16string utf16 = u"1234\U00001111\U0001ffff";
+  auto out = conv.to_bytes(utf16);
+  VERIFY( out == "to_bytes failed" );
+  VERIFY( conv.converted() == 5 );
+
+  // And should also fail on incomplete surrogate pair (not return partial):
+  out = conv.to_bytes(utf16.substr(0, utf16.size()-1));
+  VERIFY( out == "to_bytes failed" );
+  VERIFY( conv.converted() == 5 );
+}
+
+int main()
+{
+  test01();
+  test02();
+  test03();
+  test04();
+  test05();
+  test06();
+  test07();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc
new file mode 100644
index 0000000..1251acb
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/79980.cc
@@ -0,0 +1,94 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <locale>
+#include <string>
+#include <testsuite_hooks.h>
+
+using std::wstring_convert;
+using std::codecvt_utf8;
+
+void
+test01()
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char16_t>, char16_t> c("bad", u"BAD");
+
+  // utf-8 to ucs2 conversion should fail on character outside BMP
+  auto ucs2 = c.from_bytes(src);
+  VERIFY( ucs2 == u"BAD" );
+  VERIFY( c.converted() == 7 );
+
+  // ucs2 to utf-8 conversion should fail on invalid ucs2 input:
+  std::u16string utf16 = u"1234\U00001111\U0001ffff";
+  auto out = c.to_bytes(utf16);
+  VERIFY( out == "bad" );
+  VERIFY( c.converted() == 5 );
+
+  // And should also fail on incomplete surrogate pair (not return partial):
+  out = c.to_bytes(utf16.substr(0, utf16.size()-1));
+  VERIFY( out == "bad" );
+  VERIFY( c.converted() == 5 );
+}
+
+void
+test02()
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char16_t, 0x1000>, char16_t> c("bad", u"BAD");
+
+  // utf-8 to ucs2 conversion should fail on character above Maxcode=0x1000
+  auto ucs2 = c.from_bytes(src);
+  VERIFY( ucs2 == u"BAD" );
+  VERIFY( c.converted() == 4 );
+}
+
+void
+test03()
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char32_t, 0x10000>, char32_t> c("bad", U"BAD");
+
+  // utf-8 to ucs4 conversion should fail on character above Maxcode=0x10000
+  auto ucs4 = c.from_bytes(src);
+  VERIFY( ucs4 == U"BAD" );
+  VERIFY( c.converted() == 7 );
+}
+
+void
+test04()
+{
+  std::string src = u8"1234\U00001111\U0001ffff";
+  wstring_convert<codecvt_utf8<char32_t, 0x1000>, char32_t> c("bad", U"BAD");
+
+  // utf-8 to ucs4 conversion should fail on character above Maxcode=0x1000
+  auto ucs4 = c.from_bytes(src);
+  VERIFY( ucs4 == U"BAD" );
+  VERIFY( c.converted() == 4 );
+}
+
+int
+main()
+{
+  test01();
+  test02();
+  test03();
+  test04();
+}


[-- Attachment #4: codecvt_max_length.patch --]
[-- Type: text/plain, Size: 18570 bytes --]

commit 1618fc19cba68e26def23cdf9ad980fa5e672683
Author: Jonathan Wakely <jwakely@redhat.com>
Date:   Sat Mar 11 14:15:38 2017 +0000

    Fix encoding() and max_length() values for codecvt facets
    
    	* src/c++11/codecvt.cc (codecvt<char16_t, char, mbstate_t>)
    	(codecvt<char32_t, char, mbstate_t>, __codecvt_utf8_base<char16_t>)
    	(__codecvt_utf8_base<char32_t>, __codecvt_utf8_base<wchar_t>)
    	(__codecvt_utf16_base<char16_t>, __codecvt_utf16_base<char32_t>)
    	(__codecvt_utf16_base<wchar_t>, __codecvt_utf8_utf16_base<char16_t>)
    	(__codecvt_utf8_utf16_base<char32_t>)
    	(__codecvt_utf8_utf16_base<wchar_t>): Fix do_encoding() and
    	do_max_length() return values.
    	* testsuite/22_locale/codecvt/codecvt_utf16/members.cc: New test.
    	* testsuite/22_locale/codecvt/codecvt_utf8/members.cc: New test.
    	* testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc: New test.

diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index a50804c..9c91725 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -72,8 +72,8 @@ namespace
 
   // Multibyte sequences can have "header" consisting of Byte Order Mark
   const unsigned char utf8_bom[3] = { 0xEF, 0xBB, 0xBF };
-  const unsigned char utf16_bom[4] = { 0xFE, 0xFF };
-  const unsigned char utf16le_bom[4] = { 0xFF, 0xFE };
+  const unsigned char utf16_bom[2] = { 0xFE, 0xFF };
+  const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };
 
   template<size_t N>
     inline bool
@@ -695,7 +695,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 codecvt<char16_t, char, mbstate_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 codecvt<char16_t, char, mbstate_t>::do_always_noconv() const throw()
@@ -713,9 +713,9 @@ do_length(state_type&, const extern_type* __from,
 int
 codecvt<char16_t, char, mbstate_t>::do_max_length() const throw()
 {
-  // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
-  // whereas 4 byte sequences require two 16-bit code units.
-  return 3;
+  // A single character (one or two UTF-16 code units) requires
+  // up to four UTF-8 code units.
+  return 4;
 }
 
 // Define members of codecvt<char32_t, char, mbstate_t> specialization.
@@ -766,7 +766,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 codecvt<char32_t, char, mbstate_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 codecvt<char32_t, char, mbstate_t>::do_always_noconv() const throw()
@@ -783,7 +783,11 @@ do_length(state_type&, const extern_type* __from,
 
 int
 codecvt<char32_t, char, mbstate_t>::do_max_length() const throw()
-{ return 4; }
+{
+  // A single character (one UTF-32 code unit) requires
+  // up to 4 UTF-8 code units.
+  return 4;
+}
 
 // Define members of codecvt_utf8<char16_t> base class implementation.
 // Converts from UTF-8 to UCS-2.
@@ -835,7 +839,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf8_base<char16_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 __codecvt_utf8_base<char16_t>::do_always_noconv() const throw()
@@ -852,7 +856,14 @@ do_length(state_type&, const extern_type* __from,
 
 int
 __codecvt_utf8_base<char16_t>::do_max_length() const throw()
-{ return 3; }
+{
+  // A single UCS-2 character requires up to three UTF-8 code units.
+  // (UCS-2 cannot represent characters that use four UTF-8 code units).
+  int max = 3;
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
+}
 
 // Define members of codecvt_utf8<char32_t> base class implementation.
 // Converts from UTF-8 to UTF-32 (aka UCS-4).
@@ -900,7 +911,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf8_base<char32_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 __codecvt_utf8_base<char32_t>::do_always_noconv() const throw()
@@ -917,7 +928,13 @@ do_length(state_type&, const extern_type* __from,
 
 int
 __codecvt_utf8_base<char32_t>::do_max_length() const throw()
-{ return 4; }
+{
+  // A single UCS-4 character requires up to four UTF-8 code units.
+  int max = 4;
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
+}
 
 #ifdef _GLIBCXX_USE_WCHAR_T
 // Define members of codecvt_utf8<wchar_t> base class implementation.
@@ -992,7 +1009,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf8_base<wchar_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 __codecvt_utf8_base<wchar_t>::do_always_noconv() const throw()
@@ -1015,7 +1032,16 @@ do_length(state_type&, const extern_type* __from,
 
 int
 __codecvt_utf8_base<wchar_t>::do_max_length() const throw()
-{ return 4; }
+{
+#if __SIZEOF_WCHAR_T__ == 2
+  int max = 3; // See __codecvt_utf8_base<char16_t>::do_max_length()
+#else
+  int max = 4; // See __codecvt_utf8_base<char32_t>::do_max_length()
+#endif
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
+}
 #endif
 
 // Define members of codecvt_utf16<char16_t> base class implementation.
@@ -1070,7 +1096,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf16_base<char16_t>::do_encoding() const throw()
-{ return 1; }
+{ return 0; } // UTF-16 is not a fixed-width encoding
 
 bool
 __codecvt_utf16_base<char16_t>::do_always_noconv() const throw()
@@ -1089,7 +1115,14 @@ do_length(state_type&, const extern_type* __from,
 
 int
 __codecvt_utf16_base<char16_t>::do_max_length() const throw()
-{ return 3; }
+{
+  // A single UCS-2 character requires one UTF-16 code unit (so two chars).
+  // (UCS-2 cannot represent characters that use multiple UTF-16 code units).
+  int max = 2;
+  if (_M_mode & consume_header)
+    max += sizeof(utf16_bom);
+  return max;
+}
 
 // Define members of codecvt_utf16<char32_t> base class implementation.
 // Converts from UTF-16 to UTF-32 (aka UCS-4).
@@ -1143,7 +1176,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf16_base<char32_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-16 is not a fixed-width encoding
 
 bool
 __codecvt_utf16_base<char32_t>::do_always_noconv() const throw()
@@ -1162,7 +1195,14 @@ do_length(state_type&, const extern_type* __from,
 
 int
 __codecvt_utf16_base<char32_t>::do_max_length() const throw()
-{ return 4; }
+{
+  // A single UCS-4 character requires one or two UTF-16 code units
+  // (so up to four chars).
+  int max = 4;
+  if (_M_mode & consume_header)
+    max += sizeof(utf16_bom);
+  return max;
+}
 
 #ifdef _GLIBCXX_USE_WCHAR_T
 // Define members of codecvt_utf16<wchar_t> base class implementation.
@@ -1237,7 +1277,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf16_base<wchar_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-16 is not a fixed-width encoding
 
 bool
 __codecvt_utf16_base<wchar_t>::do_always_noconv() const throw()
@@ -1261,7 +1301,16 @@ do_length(state_type&, const extern_type* __from,
 
 int
 __codecvt_utf16_base<wchar_t>::do_max_length() const throw()
-{ return 4; }
+{
+#if __SIZEOF_WCHAR_T__ == 2
+  int max = 2; // See __codecvt_utf16_base<char16_t>::do_max_length()
+#else
+  int max = 4; // See __codecvt_utf16_base<char32_t>::do_max_length()
+#endif
+  if (_M_mode & consume_header)
+    max += sizeof(utf16_bom);
+  return max;
+}
 #endif
 
 // Define members of codecvt_utf8_utf16<char16_t> base class implementation.
@@ -1314,7 +1363,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf8_utf16_base<char16_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 __codecvt_utf8_utf16_base<char16_t>::do_always_noconv() const throw()
@@ -1332,9 +1381,12 @@ do_length(state_type&, const extern_type* __from,
 int
 __codecvt_utf8_utf16_base<char16_t>::do_max_length() const throw()
 {
-  // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
-  // whereas 4 byte sequences require two 16-bit code units.
-  return 3;
+  // A single character can be 1 or 2 UTF-16 code units,
+  // requiring up to 4 UTF-8 code units.
+  int max = 4;
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
 }
 
 // Define members of codecvt_utf8_utf16<char32_t> base class implementation.
@@ -1387,7 +1439,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf8_utf16_base<char32_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 __codecvt_utf8_utf16_base<char32_t>::do_always_noconv() const throw()
@@ -1405,9 +1457,12 @@ do_length(state_type&, const extern_type* __from,
 int
 __codecvt_utf8_utf16_base<char32_t>::do_max_length() const throw()
 {
-  // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
-  // whereas 4 byte sequences require two 16-bit code units.
-  return 3;
+  // A single character can be 1 or 2 UTF-16 code units,
+  // requiring up to 4 UTF-8 code units.
+  int max = 4;
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
 }
 
 #ifdef _GLIBCXX_USE_WCHAR_T
@@ -1461,7 +1516,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 
 int
 __codecvt_utf8_utf16_base<wchar_t>::do_encoding() const throw()
-{ return 0; }
+{ return 0; } // UTF-8 is not a fixed-width encoding
 
 bool
 __codecvt_utf8_utf16_base<wchar_t>::do_always_noconv() const throw()
@@ -1479,9 +1534,12 @@ do_length(state_type&, const extern_type* __from,
 int
 __codecvt_utf8_utf16_base<wchar_t>::do_max_length() const throw()
 {
-  // Any valid UTF-8 sequence of 3 bytes fits in a single 16-bit code unit,
-  // whereas 4 byte sequences require two 16-bit code units.
-  return 3;
+  // A single character can be 1 or 2 UTF-16 code units,
+  // requiring up to 4 UTF-8 code units.
+  int max = 4;
+  if (_M_mode & consume_header)
+    max += sizeof(utf8_bom);
+  return max;
 }
 #endif
 
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
index b40fc65..3288e77 100644
--- a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
@@ -34,7 +34,7 @@ test01()
   const codecvt_c16* const cvt = &use_facet<codecvt_c16>(loc_c);
 
   VERIFY(!cvt->always_noconv());
-  VERIFY(cvt->max_length() == 3);
+  VERIFY(cvt->max_length() == 4);
   VERIFY(cvt->encoding() == 0);
 
   const char u8dat[] = u8"H\U000000E4ll\U000000F6 \U0001F63F \U000056FD "
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc
new file mode 100644
index 0000000..993c860
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/members.cc
@@ -0,0 +1,81 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+const int bomlen = 2; // UTF-16 BOM is 16 bits
+
+void
+test01()
+{
+  const int maxlen = 2;
+
+  std::codecvt_utf16<char16_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf16<char16_t, 0x10ffff, std::consume_header> c_bom;
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );
+}
+
+void
+test02()
+{
+  const int maxlen = 4;
+
+  std::codecvt_utf16<char32_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf16<char32_t, 0x10ffff, std::consume_header> c_bom;
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );
+}
+
+void
+test03()
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+  const int maxlen = sizeof(wchar_t) == 4 ? 4 : 2;
+
+  std::codecvt_utf16<wchar_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf16<wchar_t, 0x10ffff, std::consume_header> c_bom;
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );
+#endif
+}
+
+int
+main()
+{
+  test01();
+  test02();
+  test03();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc
new file mode 100644
index 0000000..baeb049
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8/members.cc
@@ -0,0 +1,81 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+const int bomlen = 3; // UTF-8 BOM is 24 bits
+
+void
+test01()
+{
+  const int maxlen = 3;
+
+  std::codecvt_utf8<char16_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf8<char16_t, 0x10ffff, std::consume_header> c_bom;
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );
+}
+
+void
+test02()
+{
+  const int maxlen = 4;
+
+  std::codecvt_utf8<char32_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf8<char32_t, 0x10ffff, std::consume_header> c_bom;
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );
+}
+
+void
+test03()
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+  const int maxlen = sizeof(wchar_t) == 4 ? 4 : 3;
+
+  std::codecvt_utf8<wchar_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header> c_bom;
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );
+#endif
+}
+
+int
+main()
+{
+  test01();
+  test02();
+  test03();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc
new file mode 100644
index 0000000..8fcdfff
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf8_utf16/members.cc
@@ -0,0 +1,76 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+const int bomlen = 3; // UTF-8 BOM is 24 bits
+const int maxlen = 4;
+
+void
+test01()
+{
+  std::codecvt_utf8_utf16<char16_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf8_utf16<char16_t, 0x10ffff, std::consume_header> c_bom;
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );
+}
+
+void
+test02()
+{
+  std::codecvt_utf8_utf16<char32_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf8_utf16<char32_t, 0x10ffff, std::consume_header> c_bom;
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );
+}
+
+void
+test03()
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+  std::codecvt_utf8_utf16<wchar_t> c;
+  VERIFY( c.always_noconv() == false );
+  VERIFY( c.encoding() == 0 );
+  VERIFY( c.max_length() == maxlen );
+
+  std::codecvt_utf8_utf16<wchar_t, 0x10ffff, std::consume_header> c_bom;
+  VERIFY( c_bom.always_noconv() == false );
+  VERIFY( c_bom.encoding() == 0 );
+  VERIFY( c_bom.max_length() == (maxlen + bomlen) );
+#endif
+}
+
+int
+main()
+{
+  test01();
+  test02();
+  test03();
+}

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Various fixes for <codecvt> facets
  2017-03-13 19:36 [PATCH] Various fixes for <codecvt> facets Jonathan Wakely
@ 2017-03-14 18:46 ` Jonathan Wakely
  2017-03-16 15:23   ` Jonathan Wakely
  0 siblings, 1 reply; 5+ messages in thread
From: Jonathan Wakely @ 2017-03-14 18:46 UTC (permalink / raw)
  To: libstdc++, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 1477 bytes --]

On 13/03/17 19:35 +0000, Jonathan Wakely wrote:
>This is a series of patches to fix various bugs in the Unicode
>character conversion facets.
>
>The first patch fixes a silly < versus <= bug that meant that 0xffff
>got written as a surrogate pair instead of as simply 0xffff, and an
>endianness bug for the internal representation of UTF-16 code units
>stored in char32_t or wchar_t values. That's PR 79511.
>
>The second patch fixes some incorrect bitwise operations (because I
>confused & and |) and some incorrect limits (because I confused max
>and min). That fixes determining the endianness of the external
>representation bytes when they start with a Byte Order Mark, and
>correctly reports errors on invalid UCS2. It also fixes
>wstring_convert so that it reports the number of characters that were
>converted prior to an error. That's PR 79980.
>
>The third patch fixes the output of the encoding() and max_length()
>member functions on the codecvt facets, because I wasn't correctly
>accounting for a BOM or for the differences between UTF-16 and UCS2.
>
>I plan to commit these for all branches, but I'll wait until after GCC
>7.1 is released, and fix it for 7.2 instead. These bugs aren't
>important enough to rush into trunk now.

One more patch for a problem found by the libc++ testsuite. Now we
pass all the libc++ tests, and we even pass a test that libc++ fails.
With this, I hope our <codecvt> is 100% conforming. Just in time to be
deprecated for C++17 :-)



[-- Attachment #2: patch.txt --]
[-- Type: text/x-patch, Size: 5020 bytes --]

commit 3118704bc37cd771b9fc5bf83230f38a16a7c5c3
Author: Jonathan Wakely <jwakely@redhat.com>
Date:   Tue Mar 14 17:47:12 2017 +0000

    PR libstdc++/80041 fix codecvt_utf16<wchar_t> to use UTF-16 not UTF-8
    
    	PR libstdc++/80041
    	* src/c++11/codecvt.cc (__codecvt_utf16_base<wchar_t>::do_out)
    	(__codecvt_utf16_base<wchar_t>::do_in): Convert char arguments to
    	char16_t to work with UTF-16 instead of UTF-8.
    	* testsuite/22_locale/codecvt/codecvt_utf16/80041.cc: New test.

diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index 9c91725..ef38267 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -1217,7 +1217,10 @@ do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
        extern_type* __to, extern_type* __to_end,
        extern_type*& __to_next) const
 {
-  range<char> to{ __to, __to_end };
+  range<char16_t> to{
+    reinterpret_cast<char16_t*>(__to),
+    reinterpret_cast<char16_t*>(__to_end)
+  };
 #if __SIZEOF_WCHAR_T__ == 2
   range<const char16_t> from{
     reinterpret_cast<const char16_t*>(__from),
@@ -1234,7 +1237,7 @@ do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
   return codecvt_base::error;
 #endif
   __from_next = reinterpret_cast<const wchar_t*>(from.next);
-  __to_next = to.next;
+  __to_next = reinterpret_cast<char*>(to.next);
   return res;
 }
 
@@ -1254,7 +1257,10 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
       intern_type* __to, intern_type* __to_end,
       intern_type*& __to_next) const
 {
-  range<const char> from{ __from, __from_end };
+  range<const char16_t> from{
+    reinterpret_cast<const char16_t*>(__from),
+    reinterpret_cast<const char16_t*>(__from_end)
+  };
 #if __SIZEOF_WCHAR_T__ == 2
   range<char16_t> to{
     reinterpret_cast<char16_t*>(__to),
@@ -1270,7 +1276,7 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 #else
   return codecvt_base::error;
 #endif
-  __from_next = from.next;
+  __from_next = reinterpret_cast<const char*>(from.next);
   __to_next = reinterpret_cast<wchar_t*>(to.next);
   return res;
 }
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/80041.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/80041.cc
new file mode 100644
index 0000000..a78b194
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/80041.cc
@@ -0,0 +1,87 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+void
+test01()
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+  std::codecvt_utf16<wchar_t> conv;
+  const wchar_t wc = 0x6557;
+  char bytes[2] = {0};
+  const wchar_t* wcnext;
+  std::mbstate_t st{};
+  char* next = nullptr;
+  auto res = conv.out(st, &wc, &wc+ 1, wcnext, bytes, std::end(bytes), next);
+  VERIFY( res == std::codecvt_base::ok );
+  VERIFY( wcnext == &wc + 1 );
+  VERIFY( next == std::end(bytes) );
+  VERIFY( bytes[0] == 0x65 );
+  VERIFY( bytes[1] == 0x57 );
+  VERIFY( conv.length(st, bytes, next, 1) == (next - bytes) );
+
+  wchar_t w;
+  wchar_t* wnext;
+  const char* cnext;
+  st = {};
+  res = conv.in(st, bytes, next, cnext, &w, &w + 1, wnext);
+  VERIFY( res == std::codecvt_base::ok );
+  VERIFY( wnext == &w + 1 );
+  VERIFY( cnext == next );
+  VERIFY( w == wc );
+#endif
+}
+
+void
+test02()
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+  std::codecvt_utf16<wchar_t, 0x10FFFF, std::little_endian> conv;
+  wchar_t wc = 0x6557;
+  char bytes[2] = {0};
+  const wchar_t* wcnext;
+  std::mbstate_t st{};
+  char* next = nullptr;
+  auto res = conv.out(st, &wc, &wc+ 1, wcnext, bytes, std::end(bytes), next);
+  VERIFY( res == std::codecvt_base::ok );
+  VERIFY( wcnext == &wc + 1 );
+  VERIFY( next == std::end(bytes) );
+  VERIFY( bytes[0] == 0x57 );
+  VERIFY( bytes[1] == 0x65 );
+  VERIFY( conv.length(st, bytes, next, 1) == (next - bytes) );
+
+  wchar_t w;
+  wchar_t* wnext;
+  const char* cnext;
+  st = {};
+  res = conv.in(st, bytes, next, cnext, &w, &w + 1, wnext);
+  VERIFY( res == std::codecvt_base::ok );
+  VERIFY( wnext == &w + 1 );
+  VERIFY( cnext == next );
+  VERIFY( w == wc );
+#endif
+}
+
+int main()
+{
+  test01();
+  test02();
+}

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Various fixes for <codecvt> facets
  2017-03-14 18:46 ` Jonathan Wakely
@ 2017-03-16 15:23   ` Jonathan Wakely
  2017-03-16 17:22     ` Jonathan Wakely
  2017-03-17 19:29     ` Jonathan Wakely
  0 siblings, 2 replies; 5+ messages in thread
From: Jonathan Wakely @ 2017-03-16 15:23 UTC (permalink / raw)
  To: libstdc++, gcc-patches

On 14/03/17 18:46 +0000, Jonathan Wakely wrote:
>On 13/03/17 19:35 +0000, Jonathan Wakely wrote:
>>This is a series of patches to fix various bugs in the Unicode
>>character conversion facets.
>>
>>The first patch fixes a silly < versus <= bug that meant that 0xffff
>>got written as a surrogate pair instead of as simply 0xffff, and an
>>endianness bug for the internal representation of UTF-16 code units
>>stored in char32_t or wchar_t values. That's PR 79511.
>>
>>The second patch fixes some incorrect bitwise operations (because I
>>confused & and |) and some incorrect limits (because I confused max
>>and min). That fixes determining the endianness of the external
>>representation bytes when they start with a Byte Order Mark, and
>>correctly reports errors on invalid UCS2. It also fixes
>>wstring_convert so that it reports the number of characters that were
>>converted prior to an error. That's PR 79980.
>>
>>The third patch fixes the output of the encoding() and max_length()
>>member functions on the codecvt facets, because I wasn't correctly
>>accounting for a BOM or for the differences between UTF-16 and UCS2.
>>
>>I plan to commit these for all branches, but I'll wait until after GCC
>>7.1 is released, and fix it for 7.2 instead. These bugs aren't
>>important enough to rush into trunk now.
>
>One more patch for a problem found by the libc++ testsuite. Now we
>pass all the libc++ tests, and we even pass a test that libc++ fails.
>With this, I hope our <codecvt> is 100% conforming. Just in time to be
>deprecated for C++17 :-)

I've committed these to trunk, on the basis that they're intended to
be backported to all branches anyway (fixing features that are
currently broken in all branches). There's no point waiting if we plan
to commit them anyway, it would just mean doing an extra backport (5,
6, 7 *and* 8).

Backports will be done soon.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Various fixes for <codecvt> facets
  2017-03-16 15:23   ` Jonathan Wakely
@ 2017-03-16 17:22     ` Jonathan Wakely
  2017-03-17 19:29     ` Jonathan Wakely
  1 sibling, 0 replies; 5+ messages in thread
From: Jonathan Wakely @ 2017-03-16 17:22 UTC (permalink / raw)
  To: libstdc++, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2210 bytes --]

On 16/03/17 15:23 +0000, Jonathan Wakely wrote:
>On 14/03/17 18:46 +0000, Jonathan Wakely wrote:
>>On 13/03/17 19:35 +0000, Jonathan Wakely wrote:
>>>This is a series of patches to fix various bugs in the Unicode
>>>character conversion facets.
>>>
>>>The first patch fixes a silly < versus <= bug that meant that 0xffff
>>>got written as a surrogate pair instead of as simply 0xffff, and an
>>>endianness bug for the internal representation of UTF-16 code units
>>>stored in char32_t or wchar_t values. That's PR 79511.
>>>
>>>The second patch fixes some incorrect bitwise operations (because I
>>>confused & and |) and some incorrect limits (because I confused max
>>>and min). That fixes determining the endianness of the external
>>>representation bytes when they start with a Byte Order Mark, and
>>>correctly reports errors on invalid UCS2. It also fixes
>>>wstring_convert so that it reports the number of characters that were
>>>converted prior to an error. That's PR 79980.
>>>
>>>The third patch fixes the output of the encoding() and max_length()
>>>member functions on the codecvt facets, because I wasn't correctly
>>>accounting for a BOM or for the differences between UTF-16 and UCS2.
>>>
>>>I plan to commit these for all branches, but I'll wait until after GCC
>>>7.1 is released, and fix it for 7.2 instead. These bugs aren't
>>>important enough to rush into trunk now.
>>
>>One more patch for a problem found by the libc++ testsuite. Now we
>>pass all the libc++ tests, and we even pass a test that libc++ fails.
>>With this, I hope our <codecvt> is 100% conforming. Just in time to be
>>deprecated for C++17 :-)
>
>I've committed these to trunk, on the basis that they're intended to
>be backported to all branches anyway (fixing features that are
>currently broken in all branches). There's no point waiting if we plan
>to commit them anyway, it would just mean doing an extra backport (5,
>6, 7 *and* 8).
>
>Backports will be done soon.

This fixes a dumb error, where I didn't stop using a "mode_t" typedef
that I introduced at one point and then removed again. It happened to
match a POSIX type on GNU/Linux and worked OK, but not on other
targets.

Fixed like so, committed as obvious.


[-- Attachment #2: patch.txt --]
[-- Type: text/x-patch, Size: 911 bytes --]

commit ce38a7334ea88e8d5fa5685916067cba9e4a7403
Author: Jonathan Wakely <jwakely@redhat.com>
Date:   Thu Mar 16 17:16:44 2017 +0000

    PR libstdc++/79980 fix target type of cast
    
    	PR libstdc++/79980
    	* src/c++11/codecvt.cc (to_integer(codecvt_mode)): Fix target type.

diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index ef38267..02866ef 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -34,7 +34,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   // The standard doesn't define these operators, which is annoying.
   static underlying_type<codecvt_mode>::type
   to_integer(codecvt_mode m)
-  { return static_cast<mode_t>(m); }
+  { return static_cast<underlying_type<codecvt_mode>::type>(m); }
 
   static codecvt_mode& operator&=(codecvt_mode& m, codecvt_mode n)
   { return m = codecvt_mode(to_integer(m) & to_integer(n)); }

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Various fixes for <codecvt> facets
  2017-03-16 15:23   ` Jonathan Wakely
  2017-03-16 17:22     ` Jonathan Wakely
@ 2017-03-17 19:29     ` Jonathan Wakely
  1 sibling, 0 replies; 5+ messages in thread
From: Jonathan Wakely @ 2017-03-17 19:29 UTC (permalink / raw)
  To: libstdc++, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 2961 bytes --]

On 16/03/17 15:23 +0000, Jonathan Wakely wrote:
>On 14/03/17 18:46 +0000, Jonathan Wakely wrote:
>>On 13/03/17 19:35 +0000, Jonathan Wakely wrote:
>>>This is a series of patches to fix various bugs in the Unicode
>>>character conversion facets.
>>>
>>>The first patch fixes a silly < versus <= bug that meant that 0xffff
>>>got written as a surrogate pair instead of as simply 0xffff, and an
>>>endianness bug for the internal representation of UTF-16 code units
>>>stored in char32_t or wchar_t values. That's PR 79511.
>>>
>>>The second patch fixes some incorrect bitwise operations (because I
>>>confused & and |) and some incorrect limits (because I confused max
>>>and min). That fixes determining the endianness of the external
>>>representation bytes when they start with a Byte Order Mark, and
>>>correctly reports errors on invalid UCS2. It also fixes
>>>wstring_convert so that it reports the number of characters that were
>>>converted prior to an error. That's PR 79980.
>>>
>>>The third patch fixes the output of the encoding() and max_length()
>>>member functions on the codecvt facets, because I wasn't correctly
>>>accounting for a BOM or for the differences between UTF-16 and UCS2.
>>>
>>>I plan to commit these for all branches, but I'll wait until after GCC
>>>7.1 is released, and fix it for 7.2 instead. These bugs aren't
>>>important enough to rush into trunk now.
>>
>>One more patch for a problem found by the libc++ testsuite. Now we
>>pass all the libc++ tests, and we even pass a test that libc++ fails.
>>With this, I hope our <codecvt> is 100% conforming. Just in time to be
>>deprecated for C++17 :-)
>
>I've committed these to trunk, on the basis that they're intended to
>be backported to all branches anyway (fixing features that are
>currently broken in all branches). There's no point waiting if we plan
>to commit them anyway, it would just mean doing an extra backport (5,
>6, 7 *and* 8).
>
>Backports will be done soon.

I backported all the recent <codecvt> fixes to gcc-6-branch and it was
failing one test, due to unaligned reads in std::codecvt_utf16. That
type reads UTF-16 data from a const char* (Why narrow characters when
we have char16_t? Because <codecvt> likes to be awkward) and I was
doing that by casting the const char* to const char16_t*. That isn't
safe when the first char isn't aligned correctly for a char16_t.

This patch fixes all the unaligned accesses by abstracting the
operations on the pointers to use new overloaded operators on the
range<Elem> type. A new partial specialization range<Elem, false>
uses memcpy to read/write char16_t values from the char*, avoiding
alignment problems. The primary template (range<Elem, true>) just
dereferences the pointers directly.

Tested x86_64-linux, powerpc64le-linux, powerpc64-linux,
powerpc-ibm-aix7.2.0.0 (which has 2-byte wchar_t).

Also tested with ubsan to confirm the unaligned accesses are gone.

Committed to trunk, gcc-6-branch, gcc-5-branch.


[-- Attachment #2: patch.txt --]
[-- Type: text/x-patch, Size: 36289 bytes --]

commit 96ebc791ce1bd9cbba913d0b25b60ee4a09c41f1
Author: Jonathan Wakely <jwakely@redhat.com>
Date:   Fri Mar 17 13:00:00 2017 +0000

    Fix alignment bugs in std::codecvt_utf16
    
    	* src/c++11/codecvt.cc (range): Add non-type template parameter and
    	define overloaded operators for reading and writing code units.
    	(range<Elem, false>): Define partial specialization for accessing
    	wide characters in potentially unaligned byte ranges.
    	(ucs2_span(const char16_t*, const char16_t*, ...))
    	(ucs4_span(const char16_t*, const char16_t*, ...)): Change parameters
    	to range<const char16_t, false> in order to avoid unaligned reads.
    	(__codecvt_utf16_base<char16_t>::do_out)
    	(__codecvt_utf16_base<char32_t>::do_out)
    	(__codecvt_utf16_base<wchar_t>::do_out): Use range specialization for
    	unaligned data to avoid unaligned writes.
    	(__codecvt_utf16_base<char16_t>::do_in)
    	(__codecvt_utf16_base<char32_t>::do_in)
    	(__codecvt_utf16_base<wchar_t>::do_in): Likewise for reads. Return
    	error if there are unprocessable trailing bytes.
    	(__codecvt_utf16_base<char16_t>::do_length)
    	(__codecvt_utf16_base<char32_t>::do_length)
    	(__codecvt_utf16_base<wchar_t>::do_length): Pass arguments of type
    	range<const char16_t, false> to span functions.
    	* testsuite/22_locale/codecvt/codecvt_utf16/misaligned.cc: New test.

diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index 02866ef..1187339 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -57,17 +57,104 @@ namespace
   const char32_t incomplete_mb_character = char32_t(-2);
   const char32_t invalid_mb_sequence = char32_t(-1);
 
-  template<typename Elem>
+  // Utility type for reading and writing code units of type Elem from
+  // a range defined by a pair of pointers.
+  template<typename Elem, bool Aligned = true>
     struct range
     {
       Elem* next;
       Elem* end;
 
+      // Write a code unit.
+      range& operator=(Elem e)
+      {
+	*next++ = e;
+	return *this;
+      }
+
+      // Read the next code unit.
       Elem operator*() const { return *next; }
 
-      range& operator++() { ++next; return *this; }
+      // Read the Nth code unit.
+      Elem operator[](size_t n) const { return next[n]; }
 
+      // Move to the next code unit.
+      range& operator++()
+      {
+	++next;
+	return *this;
+      }
+
+      // Move to the Nth code unit.
+      range& operator+=(size_t n)
+      {
+	next += n;
+	return *this;
+      }
+
+      // The number of code units remaining.
       size_t size() const { return end - next; }
+
+      // The number of bytes remaining.
+      size_t nbytes() const { return (const char*)end - (const char*)next; }
+    };
+
+  // This specialization is used when accessing char16_t values through
+  // pointers to char, which might not be correctly aligned for char16_t.
+  template<typename Elem>
+    struct range<Elem, false>
+    {
+      using value_type = typename remove_const<Elem>::type;
+
+      using char_pointer = typename
+	conditional<is_const<Elem>::value, const char*, char*>::type;
+
+      char_pointer next;
+      char_pointer end;
+
+      // Write a code unit.
+      range& operator=(Elem e)
+      {
+	memcpy(next, &e, sizeof(Elem));
+	++*this;
+	return *this;
+      }
+
+      // Read the next code unit.
+      Elem operator*() const
+      {
+	value_type e;
+	memcpy(&e, next, sizeof(Elem));
+	return e;
+      }
+
+      // Read the Nth code unit.
+      Elem operator[](size_t n) const
+      {
+	value_type e;
+	memcpy(&e, next + n * sizeof(Elem), sizeof(Elem));
+	return e;
+      }
+
+      // Move to the next code unit.
+      range& operator++()
+      {
+	next += sizeof(Elem);
+	return *this;
+      }
+
+      // Move to the Nth code unit.
+      range& operator+=(size_t n)
+      {
+	next += n * sizeof(Elem);
+	return *this;
+      }
+
+      // The number of code units remaining.
+      size_t size() const { return nbytes() / sizeof(Elem); }
+
+      // The number of bytes remaining.
+      size_t nbytes() const { return end - next; }
     };
 
   // Multibyte sequences can have "header" consisting of Byte Order Mark
@@ -75,17 +162,37 @@ namespace
   const unsigned char utf16_bom[2] = { 0xFE, 0xFF };
   const unsigned char utf16le_bom[2] = { 0xFF, 0xFE };
 
-  template<size_t N>
-    inline bool
-    write_bom(range<char>& to, const unsigned char (&bom)[N])
+  // Write a BOM (space permitting).
+  template<typename C, bool A, size_t N>
+    bool
+    write_bom(range<C, A>& to, const unsigned char (&bom)[N])
     {
-      if (to.size() < N)
+      static_assert( (N / sizeof(C)) != 0, "" );
+      static_assert( (N % sizeof(C)) == 0, "" );
+
+      if (to.nbytes() < N)
 	return false;
       memcpy(to.next, bom, N);
-      to.next += N;
+      to += (N / sizeof(C));
       return true;
     }
 
+  // Try to read a BOM.
+  template<typename C, bool A, size_t N>
+    bool
+    read_bom(range<C, A>& from, const unsigned char (&bom)[N])
+    {
+      static_assert( (N / sizeof(C)) != 0, "" );
+      static_assert( (N % sizeof(C)) == 0, "" );
+
+      if (from.nbytes() >= N && !memcmp(from.next, bom, N))
+	{
+	  from += (N / sizeof(C));
+	  return true;
+	}
+      return false;
+    }
+
   // If generate_header is set in mode write out UTF-8 BOM.
   bool
   write_utf8_bom(range<char>& to, codecvt_mode mode)
@@ -97,32 +204,20 @@ namespace
 
   // If generate_header is set in mode write out the UTF-16 BOM indicated
   // by whether little_endian is set in mode.
+  template<bool Aligned>
   bool
-  write_utf16_bom(range<char16_t>& to, codecvt_mode mode)
+  write_utf16_bom(range<char16_t, Aligned>& to, codecvt_mode mode)
   {
     if (mode & generate_header)
     {
-      if (!to.size())
-	return false;
-      auto* bom = (mode & little_endian) ? utf16le_bom : utf16_bom;
-      std::memcpy(to.next, bom, 2);
-      ++to.next;
+      if (mode & little_endian)
+	return write_bom(to, utf16le_bom);
+      else
+	return write_bom(to, utf16_bom);
     }
     return true;
   }
 
-  template<size_t N>
-    inline bool
-    read_bom(range<const char>& from, const unsigned char (&bom)[N])
-    {
-      if (from.size() >= N && !memcmp(from.next, bom, N))
-	{
-	  from.next += N;
-	  return true;
-	}
-      return false;
-    }
-
   // If consume_header is set in mode update from.next to after any BOM.
   void
   read_utf8_bom(range<const char>& from, codecvt_mode mode)
@@ -135,21 +230,16 @@ namespace
   // Otherwise, if *from.next is a UTF-16 BOM increment from.next and then:
   // - if the UTF-16BE BOM was found unset little_endian in mode, or
   // - if the UTF-16LE BOM was found set little_endian in mode.
+  template<bool Aligned>
   void
-  read_utf16_bom(range<const char16_t>& from, codecvt_mode& mode)
+  read_utf16_bom(range<const char16_t, Aligned>& from, codecvt_mode& mode)
   {
-    if (mode & consume_header && from.size())
+    if (mode & consume_header)
       {
-	if (!memcmp(from.next, utf16_bom, 2))
-	  {
-	    ++from.next;
-	    mode &= ~little_endian;
-	  }
-	else if (!memcmp(from.next, utf16le_bom, 2))
-	  {
-	    ++from.next;
-	    mode |= little_endian;
-	  }
+	if (read_bom(from, utf16_bom))
+	  mode &= ~little_endian;
+	else if (read_bom(from, utf16le_bom))
+	  mode |= little_endian;
       }
   }
 
@@ -162,11 +252,11 @@ namespace
     const size_t avail = from.size();
     if (avail == 0)
       return incomplete_mb_character;
-    unsigned char c1 = from.next[0];
+    unsigned char c1 = from[0];
     // https://en.wikipedia.org/wiki/UTF-8#Sample_code
     if (c1 < 0x80)
     {
-      ++from.next;
+      ++from;
       return c1;
     }
     else if (c1 < 0xC2) // continuation or overlong 2-byte sequence
@@ -175,51 +265,51 @@ namespace
     {
       if (avail < 2)
 	return incomplete_mb_character;
-      unsigned char c2 = from.next[1];
+      unsigned char c2 = from[1];
       if ((c2 & 0xC0) != 0x80)
 	return invalid_mb_sequence;
       char32_t c = (c1 << 6) + c2 - 0x3080;
       if (c <= maxcode)
-	from.next += 2;
+	from += 2;
       return c;
     }
     else if (c1 < 0xF0) // 3-byte sequence
     {
       if (avail < 3)
 	return incomplete_mb_character;
-      unsigned char c2 = from.next[1];
+      unsigned char c2 = from[1];
       if ((c2 & 0xC0) != 0x80)
 	return invalid_mb_sequence;
       if (c1 == 0xE0 && c2 < 0xA0) // overlong
 	return invalid_mb_sequence;
-      unsigned char c3 = from.next[2];
+      unsigned char c3 = from[2];
       if ((c3 & 0xC0) != 0x80)
 	return invalid_mb_sequence;
       char32_t c = (c1 << 12) + (c2 << 6) + c3 - 0xE2080;
       if (c <= maxcode)
-	from.next += 3;
+	from += 3;
       return c;
     }
     else if (c1 < 0xF5) // 4-byte sequence
     {
       if (avail < 4)
 	return incomplete_mb_character;
-      unsigned char c2 = from.next[1];
+      unsigned char c2 = from[1];
       if ((c2 & 0xC0) != 0x80)
 	return invalid_mb_sequence;
       if (c1 == 0xF0 && c2 < 0x90) // overlong
 	return invalid_mb_sequence;
       if (c1 == 0xF4 && c2 >= 0x90) // > U+10FFFF
       return invalid_mb_sequence;
-      unsigned char c3 = from.next[2];
+      unsigned char c3 = from[2];
       if ((c3 & 0xC0) != 0x80)
 	return invalid_mb_sequence;
-      unsigned char c4 = from.next[3];
+      unsigned char c4 = from[3];
       if ((c4 & 0xC0) != 0x80)
 	return invalid_mb_sequence;
       char32_t c = (c1 << 18) + (c2 << 12) + (c3 << 6) + c4 - 0x3C82080;
       if (c <= maxcode)
-	from.next += 4;
+	from += 4;
       return c;
     }
     else // > U+10FFFF
@@ -233,31 +323,31 @@ namespace
       {
 	if (to.size() < 1)
 	  return false;
-	*to.next++ = code_point;
+	to = code_point;
       }
     else if (code_point <= 0x7FF)
       {
 	if (to.size() < 2)
 	  return false;
-	*to.next++ = (code_point >> 6) + 0xC0;
-	*to.next++ = (code_point & 0x3F) + 0x80;
+	to = (code_point >> 6) + 0xC0;
+	to = (code_point & 0x3F) + 0x80;
       }
     else if (code_point <= 0xFFFF)
       {
 	if (to.size() < 3)
 	  return false;
-	*to.next++ = (code_point >> 12) + 0xE0;
-	*to.next++ = ((code_point >> 6) & 0x3F) + 0x80;
-	*to.next++ = (code_point & 0x3F) + 0x80;
+	to = (code_point >> 12) + 0xE0;
+	to = ((code_point >> 6) & 0x3F) + 0x80;
+	to = (code_point & 0x3F) + 0x80;
       }
     else if (code_point <= 0x10FFFF)
       {
 	if (to.size() < 4)
 	  return false;
-	*to.next++ = (code_point >> 18) + 0xF0;
-	*to.next++ = ((code_point >> 12) & 0x3F) + 0x80;
-	*to.next++ = ((code_point >> 6) & 0x3F) + 0x80;
-	*to.next++ = (code_point & 0x3F) + 0x80;
+	to = (code_point >> 18) + 0xF0;
+	to = ((code_point >> 12) & 0x3F) + 0x80;
+	to = ((code_point >> 6) & 0x3F) + 0x80;
+	to = (code_point & 0x3F) + 0x80;
       }
     else
       return false;
@@ -298,38 +388,39 @@ namespace
   // The sequence's endianness is indicated by (mode & little_endian).
   // Updates from.next if the codepoint is not greater than maxcode.
   // Returns invalid_mb_sequence, incomplete_mb_character or the code point.
-  char32_t
-  read_utf16_code_point(range<const char16_t>& from, unsigned long maxcode,
-			codecvt_mode mode)
-  {
-    const size_t avail = from.size();
-    if (avail == 0)
-      return incomplete_mb_character;
-    int inc = 1;
-    char32_t c = adjust_byte_order(from.next[0], mode);
-    if (is_high_surrogate(c))
-      {
-	if (avail < 2)
-	  return incomplete_mb_character;
-	const char16_t c2 = adjust_byte_order(from.next[1], mode);
-	if (is_low_surrogate(c2))
-	  {
-	    c = surrogate_pair_to_code_point(c, c2);
-	    inc = 2;
-	  }
-	else
-	  return invalid_mb_sequence;
-      }
-    else if (is_low_surrogate(c))
-      return invalid_mb_sequence;
-    if (c <= maxcode)
-      from.next += inc;
-    return c;
-  }
+  template<bool Aligned>
+    char32_t
+    read_utf16_code_point(range<const char16_t, Aligned>& from,
+			  unsigned long maxcode, codecvt_mode mode)
+    {
+      const size_t avail = from.size();
+      if (avail == 0)
+	return incomplete_mb_character;
+      int inc = 1;
+      char32_t c = adjust_byte_order(from[0], mode);
+      if (is_high_surrogate(c))
+	{
+	  if (avail < 2)
+	    return incomplete_mb_character;
+	  const char16_t c2 = adjust_byte_order(from[1], mode);
+	  if (is_low_surrogate(c2))
+	    {
+	      c = surrogate_pair_to_code_point(c, c2);
+	      inc = 2;
+	    }
+	  else
+	    return invalid_mb_sequence;
+	}
+      else if (is_low_surrogate(c))
+	return invalid_mb_sequence;
+      if (c <= maxcode)
+	from += inc;
+      return c;
+    }
 
-  template<typename C>
+  template<typename C, bool A>
   bool
-  write_utf16_code_point(range<C>& to, char32_t codepoint, codecvt_mode mode)
+  write_utf16_code_point(range<C, A>& to, char32_t codepoint, codecvt_mode mode)
   {
     static_assert(sizeof(C) >= 2, "a code unit must be at least 16-bit");
 
@@ -337,8 +428,7 @@ namespace
       {
 	if (to.size() > 0)
 	  {
-	    *to.next = adjust_byte_order(codepoint, mode);
-	    ++to.next;
+	    to = adjust_byte_order(codepoint, mode);
 	    return true;
 	  }
       }
@@ -348,9 +438,8 @@ namespace
 	const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
 	char16_t lead = LEAD_OFFSET + (codepoint >> 10);
 	char16_t trail = 0xDC00 + (codepoint & 0x3FF);
-	to.next[0] = adjust_byte_order(lead, mode);
-	to.next[1] = adjust_byte_order(trail, mode);
-	to.next += 2;
+	to = adjust_byte_order(lead, mode);
+	to = adjust_byte_order(trail, mode);
 	return true;
       }
     return false;
@@ -369,7 +458,7 @@ namespace
 	  return codecvt_base::partial;
 	if (codepoint > maxcode)
 	  return codecvt_base::error;
-	*to.next++ = codepoint;
+	to = codepoint;
       }
     return from.size() ? codecvt_base::partial : codecvt_base::ok;
   }
@@ -383,19 +472,19 @@ namespace
       return codecvt_base::partial;
     while (from.size())
       {
-	const char32_t c = from.next[0];
+	const char32_t c = from[0];
 	if (c > maxcode)
 	  return codecvt_base::error;
 	if (!write_utf8_code_point(to, c))
 	  return codecvt_base::partial;
-	++from.next;
+	++from;
       }
     return codecvt_base::ok;
   }
 
   // utf16 -> ucs4
   codecvt_base::result
-  ucs4_in(range<const char16_t>& from, range<char32_t>& to,
+  ucs4_in(range<const char16_t, false>& from, range<char32_t>& to,
           unsigned long maxcode = max_code_point, codecvt_mode mode = {})
   {
     read_utf16_bom(from, mode);
@@ -406,26 +495,26 @@ namespace
 	  return codecvt_base::partial;
 	if (codepoint > maxcode)
 	  return codecvt_base::error;
-	*to.next++ = codepoint;
+	to = codepoint;
       }
     return from.size() ? codecvt_base::partial : codecvt_base::ok;
   }
 
   // ucs4 -> utf16
   codecvt_base::result
-  ucs4_out(range<const char32_t>& from, range<char16_t>& to,
+  ucs4_out(range<const char32_t>& from, range<char16_t, false>& to,
            unsigned long maxcode = max_code_point, codecvt_mode mode = {})
   {
     if (!write_utf16_bom(to, mode))
       return codecvt_base::partial;
     while (from.size())
       {
-	const char32_t c = from.next[0];
+	const char32_t c = from[0];
 	if (c > maxcode)
 	  return codecvt_base::error;
 	if (!write_utf16_code_point(to, c, mode))
 	  return codecvt_base::partial;
-	++from.next;
+	++from;
       }
     return codecvt_base::ok;
   }
@@ -443,7 +532,7 @@ namespace
     read_utf8_bom(from, mode);
     while (from.size() && to.size())
       {
-	const char* const first = from.next;
+	auto orig = from;
 	const char32_t codepoint = read_utf8_code_point(from, maxcode);
 	if (codepoint == incomplete_mb_character)
 	  {
@@ -456,7 +545,7 @@ namespace
 	  return codecvt_base::error;
 	if (!write_utf16_code_point(to, codepoint, mode))
 	  {
-	    from.next = first;
+	    from = orig; // rewind to previous position
 	    return codecvt_base::partial;
 	  }
       }
@@ -474,7 +563,7 @@ namespace
       return codecvt_base::partial;
     while (from.size())
       {
-	char32_t c = from.next[0];
+	char32_t c = from[0];
 	int inc = 1;
 	if (is_high_surrogate(c))
 	  {
@@ -484,7 +573,7 @@ namespace
 	    if (from.size() < 2)
 	      return codecvt_base::ok; // stop converting at this point
 
-	    const char32_t c2 = from.next[1];
+	    const char32_t c2 = from[1];
 	    if (is_low_surrogate(c2))
 	      {
 		c = surrogate_pair_to_code_point(c, c2);
@@ -499,7 +588,7 @@ namespace
 	  return codecvt_base::error;
 	if (!write_utf8_code_point(to, c))
 	  return codecvt_base::partial;
-	from.next += inc;
+	from += inc;
       }
     return codecvt_base::ok;
   }
@@ -548,27 +637,27 @@ namespace
 
   // ucs2 -> utf16
   codecvt_base::result
-  ucs2_out(range<const char16_t>& from, range<char16_t>& to,
+  ucs2_out(range<const char16_t>& from, range<char16_t, false>& to,
 	   char32_t maxcode = max_code_point, codecvt_mode mode = {})
   {
     if (!write_utf16_bom(to, mode))
       return codecvt_base::partial;
     while (from.size() && to.size())
       {
-	char16_t c = from.next[0];
+	char16_t c = from[0];
 	if (is_high_surrogate(c))
 	  return codecvt_base::error;
 	if (c > maxcode)
 	  return codecvt_base::error;
-	*to.next++ = adjust_byte_order(c, mode);
-	++from.next;
+	to = adjust_byte_order(c, mode);
+	++from;
       }
     return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
   }
 
   // utf16 -> ucs2
   codecvt_base::result
-  ucs2_in(range<const char16_t>& from, range<char16_t>& to,
+  ucs2_in(range<const char16_t, false>& from, range<char16_t>& to,
 	  char32_t maxcode = max_code_point, codecvt_mode mode = {})
   {
     read_utf16_bom(from, mode);
@@ -581,23 +670,22 @@ namespace
 	  return codecvt_base::error; // UCS-2 only supports single units.
 	if (c > maxcode)
 	  return codecvt_base::error;
-	*to.next++ = c;
+	to = c;
       }
     return from.size() == 0 ? codecvt_base::ok : codecvt_base::partial;
   }
 
   const char16_t*
-  ucs2_span(const char16_t* begin, const char16_t* end, size_t max,
+  ucs2_span(range<const char16_t, false>& from, size_t max,
             char32_t maxcode, codecvt_mode mode)
   {
-    range<const char16_t> from{ begin, end };
     read_utf16_bom(from, mode);
     // UCS-2 only supports characters in the BMP, i.e. one UTF-16 code unit:
     maxcode = std::min(max_single_utf16_unit, maxcode);
     char32_t c = 0;
     while (max-- && c <= maxcode)
       c = read_utf16_code_point(from, maxcode, mode);
-    return from.next;
+    return reinterpret_cast<const char16_t*>(from.next);
   }
 
   const char*
@@ -629,15 +717,14 @@ namespace
 
   // return pos such that [begin,pos) is valid UCS-4 string no longer than max
   const char16_t*
-  ucs4_span(const char16_t* begin, const char16_t* end, size_t max,
+  ucs4_span(range<const char16_t, false>& from, size_t max,
             char32_t maxcode = max_code_point, codecvt_mode mode = {})
   {
-    range<const char16_t> from{ begin, end };
     read_utf16_bom(from, mode);
     char32_t c = 0;
     while (max-- && c <= maxcode)
       c = read_utf16_code_point(from, maxcode, mode);
-    return from.next;
+    return reinterpret_cast<const char16_t*>(from.next);
   }
 }
 
@@ -937,6 +1024,13 @@ __codecvt_utf8_base<char32_t>::do_max_length() const throw()
 }
 
 #ifdef _GLIBCXX_USE_WCHAR_T
+
+#if __SIZEOF_WCHAR_T__ == 2
+static_assert(sizeof(wchar_t) == sizeof(char16_t), "");
+#elif __SIZEOF_WCHAR_T__ == 4
+static_assert(sizeof(wchar_t) == sizeof(char32_t), "");
+#endif
+
 // Define members of codecvt_utf8<wchar_t> base class implementation.
 // Converts from UTF-8 to UCS-2 or UCS-4 depending on sizeof(wchar_t).
 
@@ -1057,10 +1151,7 @@ do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
        extern_type*& __to_next) const
 {
   range<const char16_t> from{ __from, __from_end };
-  range<char16_t> to{
-    reinterpret_cast<char16_t*>(__to),
-    reinterpret_cast<char16_t*>(__to_end)
-  };
+  range<char16_t, false> to{ __to, __to_end };
   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
   __from_next = from.next;
   __to_next = reinterpret_cast<char*>(to.next);
@@ -1083,14 +1174,13 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
       intern_type* __to, intern_type* __to_end,
       intern_type*& __to_next) const
 {
-  range<const char16_t> from{
-    reinterpret_cast<const char16_t*>(__from),
-    reinterpret_cast<const char16_t*>(__from_end)
-  };
+  range<const char16_t, false> from{ __from, __from_end };
   range<char16_t> to{ __to, __to_end };
   auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
   __from_next = reinterpret_cast<const char*>(from.next);
   __to_next = to.next;
+  if (res == codecvt_base::ok && __from_next != __from_end)
+    res = codecvt_base::error;
   return res;
 }
 
@@ -1107,9 +1197,8 @@ __codecvt_utf16_base<char16_t>::
 do_length(state_type&, const extern_type* __from,
 	  const extern_type* __end, size_t __max) const
 {
-  auto next = reinterpret_cast<const char16_t*>(__from);
-  next = ucs2_span(next, reinterpret_cast<const char16_t*>(__end), __max,
-		   _M_maxcode, _M_mode);
+  range<const char16_t, false> from{ __from, __end };
+  const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
   return reinterpret_cast<const char*>(next) - __from;
 }
 
@@ -1137,10 +1226,7 @@ do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
        extern_type*& __to_next) const
 {
   range<const char32_t> from{ __from, __from_end };
-  range<char16_t> to{
-    reinterpret_cast<char16_t*>(__to),
-    reinterpret_cast<char16_t*>(__to_end)
-  };
+  range<char16_t, false> to{ __to, __to_end };
   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
   __from_next = from.next;
   __to_next = reinterpret_cast<char*>(to.next);
@@ -1163,14 +1249,13 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
       intern_type* __to, intern_type* __to_end,
       intern_type*& __to_next) const
 {
-  range<const char16_t> from{
-    reinterpret_cast<const char16_t*>(__from),
-    reinterpret_cast<const char16_t*>(__from_end)
-  };
+  range<const char16_t, false> from{ __from, __from_end };
   range<char32_t> to{ __to, __to_end };
   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
   __from_next = reinterpret_cast<const char*>(from.next);
   __to_next = to.next;
+  if (res == codecvt_base::ok && __from_next != __from_end)
+    res = codecvt_base::error;
   return res;
 }
 
@@ -1187,9 +1272,8 @@ __codecvt_utf16_base<char32_t>::
 do_length(state_type&, const extern_type* __from,
 	  const extern_type* __end, size_t __max) const
 {
-  auto next = reinterpret_cast<const char16_t*>(__from);
-  next = ucs4_span(next, reinterpret_cast<const char16_t*>(__end), __max,
-		   _M_maxcode, _M_mode);
+  range<const char16_t, false> from{ __from, __end };
+  const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
   return reinterpret_cast<const char*>(next) - __from;
 }
 
@@ -1217,20 +1301,17 @@ do_out(state_type&, const intern_type* __from, const intern_type* __from_end,
        extern_type* __to, extern_type* __to_end,
        extern_type*& __to_next) const
 {
-  range<char16_t> to{
-    reinterpret_cast<char16_t*>(__to),
-    reinterpret_cast<char16_t*>(__to_end)
-  };
+  range<char16_t, false> to{ __to, __to_end };
 #if __SIZEOF_WCHAR_T__ == 2
   range<const char16_t> from{
     reinterpret_cast<const char16_t*>(__from),
-    reinterpret_cast<const char16_t*>(__from_end)
+    reinterpret_cast<const char16_t*>(__from_end),
   };
   auto res = ucs2_out(from, to, _M_maxcode, _M_mode);
 #elif __SIZEOF_WCHAR_T__ == 4
   range<const char32_t> from{
     reinterpret_cast<const char32_t*>(__from),
-    reinterpret_cast<const char32_t*>(__from_end)
+    reinterpret_cast<const char32_t*>(__from_end),
   };
   auto res = ucs4_out(from, to, _M_maxcode, _M_mode);
 #else
@@ -1257,20 +1338,17 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
       intern_type* __to, intern_type* __to_end,
       intern_type*& __to_next) const
 {
-  range<const char16_t> from{
-    reinterpret_cast<const char16_t*>(__from),
-    reinterpret_cast<const char16_t*>(__from_end)
-  };
+  range<const char16_t, false> from{ __from, __from_end };
 #if __SIZEOF_WCHAR_T__ == 2
   range<char16_t> to{
     reinterpret_cast<char16_t*>(__to),
-    reinterpret_cast<char16_t*>(__to_end)
+    reinterpret_cast<char16_t*>(__to_end),
   };
   auto res = ucs2_in(from, to, _M_maxcode, _M_mode);
 #elif __SIZEOF_WCHAR_T__ == 4
   range<char32_t> to{
     reinterpret_cast<char32_t*>(__to),
-    reinterpret_cast<char32_t*>(__to_end)
+    reinterpret_cast<char32_t*>(__to_end),
   };
   auto res = ucs4_in(from, to, _M_maxcode, _M_mode);
 #else
@@ -1278,6 +1356,8 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 #endif
   __from_next = reinterpret_cast<const char*>(from.next);
   __to_next = reinterpret_cast<wchar_t*>(to.next);
+  if (res == codecvt_base::ok && __from_next != __from_end)
+    res = codecvt_base::error;
   return res;
 }
 
@@ -1294,13 +1374,11 @@ __codecvt_utf16_base<wchar_t>::
 do_length(state_type&, const extern_type* __from,
 	  const extern_type* __end, size_t __max) const
 {
-  auto next = reinterpret_cast<const char16_t*>(__from);
+  range<const char16_t, false> from{ __from, __end };
 #if __SIZEOF_WCHAR_T__ == 2
-  next = ucs2_span(next, reinterpret_cast<const char16_t*>(__end), __max,
-		   _M_maxcode, _M_mode);
+  const char16_t* next = ucs2_span(from, __max, _M_maxcode, _M_mode);
 #elif __SIZEOF_WCHAR_T__ == 4
-  next = ucs4_span(next, reinterpret_cast<const char16_t*>(__end), __max,
-		   _M_maxcode, _M_mode);
+  const char16_t* next = ucs4_span(from, __max, _M_maxcode, _M_mode);
 #endif
   return reinterpret_cast<const char*>(next) - __from;
 }
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc
index 9383818..d8b9729 100644
--- a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/79980.cc
@@ -103,6 +103,31 @@ test07()
   VERIFY( conv.converted() == 5 );
 }
 
+void
+test08()
+{
+  // Read/write UTF-16 code units from data not correctly aligned for char16_t
+  Conv<char16_t, 0x10FFFF, std::generate_header> conv;
+  const char src[] = "-\xFE\xFF\0\x61\xAB\xCD";
+  auto out = conv.from_bytes(src + 1, src + 7);
+  VERIFY( out[0] == 0x0061 );
+  VERIFY( out[1] == 0xabcd );
+  auto bytes = conv.to_bytes(out);
+  VERIFY( bytes == std::string(src + 1, 6) );
+}
+
+void
+test09()
+{
+  // Read/write UTF-16 code units from data not correctly aligned for char16_t
+  Conv<char32_t, 0x10FFFF, std::generate_header> conv;
+  const char src[] = "-\xFE\xFF\xD8\x08\xDF\x45";
+  auto out = conv.from_bytes(src + 1, src + 7);
+  VERIFY( out == U"\U00012345" );
+  auto bytes = conv.to_bytes(out);
+  VERIFY( bytes == std::string(src + 1, 6) );
+}
+
 int main()
 {
   test01();
@@ -112,4 +137,6 @@ int main()
   test05();
   test06();
   test07();
+  test08();
+  test09();
 }
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/misaligned.cc b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/misaligned.cc
new file mode 100644
index 0000000..0179c18
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/codecvt_utf16/misaligned.cc
@@ -0,0 +1,289 @@
+// Copyright (C) 2017 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-do run { target c++11 } }
+
+#include <locale>
+#include <codecvt>
+#include <testsuite_hooks.h>
+
+using std::codecvt_base;
+using std::codecvt_mode;
+using std::codecvt_utf16;
+using std::wstring_convert;
+using std::mbstate_t;
+
+constexpr codecvt_mode
+operator|(codecvt_mode m1, codecvt_mode m2)
+{
+  using underlying = std::underlying_type<codecvt_mode>::type;
+  return static_cast<codecvt_mode>(static_cast<underlying>(m1) | m2);
+}
+
+// Read/write UTF-16 code units from data not correctly aligned for char16_t
+
+void
+test01()
+{
+  mbstate_t st;
+  constexpr codecvt_mode m = std::consume_header|std::generate_header;
+  codecvt_utf16<char16_t, 0x10FFFF, m> conv;
+  const char src[] = "-\xFE\xFF\0\x61\xAB\xCD";
+  const char* const src_end = src + 7;
+
+  int len = conv.length(st, src + 1, src_end, 1);
+  VERIFY( len == 4 );
+  len = conv.length(st, src + 1, src_end, 2);
+  VERIFY( len == 6 );
+
+  char16_t dst[2];
+  char16_t* const dst_end = dst + 2;
+  char16_t* dst_next;
+  const char* src_cnext;
+  auto res = conv.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( dst[0] == 0x0061 );
+  VERIFY( dst[1] == 0xabcd );
+  VERIFY( src_cnext == src_end );
+  VERIFY( dst_next == dst_end );
+
+  char out[sizeof(src)] = { src[0] };
+  char* const out_end = out + 7;
+  char* out_next;
+  const char16_t* dst_cnext;
+  res = conv.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( out_next == out_end );
+  VERIFY( dst_cnext == dst_end );
+  VERIFY( out[1] == src[1] );
+  VERIFY( out[2] == src[2] );
+  VERIFY( out[3] == src[3] );
+  VERIFY( out[4] == src[4] );
+  VERIFY( out[5] == src[5] );
+  VERIFY( out[6] == src[6] );
+
+  codecvt_utf16<char16_t, 0x10FFFF, m|std::little_endian> conv_le;
+
+  len = conv_le.length(st, src + 1, src_end, 1);
+  VERIFY( len == 4 );
+  len = conv_le.length(st, src + 1, src_end, 2);
+  VERIFY( len == 6 );
+
+  res = conv_le.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( dst[0] == 0x0061 );
+  VERIFY( dst[1] == 0xabcd );
+  VERIFY( src_cnext == src_end );
+  VERIFY( dst_next == dst_end );
+
+  res = conv_le.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( out_next == out_end );
+  VERIFY( dst_cnext == dst_end );
+  VERIFY( out[1] == src[2] );
+  VERIFY( out[2] == src[1] );
+  VERIFY( out[3] == src[4] );
+  VERIFY( out[4] == src[3] );
+  VERIFY( out[5] == src[6] );
+  VERIFY( out[6] == src[5] );
+}
+
+void
+test02()
+{
+  mbstate_t st;
+  constexpr codecvt_mode m = std::consume_header|std::generate_header;
+  codecvt_utf16<char32_t, 0x10FFFF, m> conv;
+  const char src[] = "-\xFE\xFF\0\x61\xAB\xCD\xD8\x08\xDF\x45";
+  const char* const src_end = src + 11;
+
+  int len = conv.length(st, src + 1, src_end, 1);
+  VERIFY( len == 4 );
+  len = conv.length(st, src + 1, src_end, 2);
+  VERIFY( len == 6 );
+  len = conv.length(st, src + 1, src_end, -1ul);
+  VERIFY( len == 10 );
+
+  char32_t dst[3];
+  char32_t* const dst_end = dst + 3;
+  char32_t* dst_next;
+  const char* src_cnext;
+  auto res = conv.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( dst[0] == 0x0061 );
+  VERIFY( dst[1] == 0xabcd );
+  VERIFY( dst[2] == 0x012345 );
+  VERIFY( src_cnext == src_end );
+  VERIFY( dst_next == dst_end );
+
+  char out[sizeof(src)] = { src[0] };
+  char* const out_end = out + 11;
+  char* out_next;
+  const char32_t* dst_cnext;
+  res = conv.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( out_next == out_end );
+  VERIFY( dst_cnext == dst_end );
+  VERIFY( out[1] == src[1] );
+  VERIFY( out[2] == src[2] );
+  VERIFY( out[3] == src[3] );
+  VERIFY( out[4] == src[4] );
+  VERIFY( out[5] == src[5] );
+  VERIFY( out[6] == src[6] );
+  VERIFY( out[7] == src[7] );
+  VERIFY( out[8] == src[8] );
+  VERIFY( out[9] == src[9] );
+  VERIFY( out[10] == src[10] );
+
+  codecvt_utf16<char32_t, 0x10FFFF, m|std::little_endian> conv_le;
+
+  len = conv_le.length(st, src + 1, src_end, 1);
+  VERIFY( len == 4 );
+  len = conv_le.length(st, src + 1, src_end, 2);
+  VERIFY( len == 6 );
+  len = conv.length(st, src + 1, src_end, -1ul);
+  VERIFY( len == 10 );
+
+  res = conv_le.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( dst[0] == 0x0061 );
+  VERIFY( dst[1] == 0xabcd );
+  VERIFY( dst[2] == 0x012345 );
+  VERIFY( src_cnext == src_end );
+  VERIFY( dst_next == dst_end );
+
+  res = conv_le.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( out_next == out_end );
+  VERIFY( dst_cnext == dst_end );
+  VERIFY( out[1] == src[2] );
+  VERIFY( out[2] == src[1] );
+  VERIFY( out[3] == src[4] );
+  VERIFY( out[4] == src[3] );
+  VERIFY( out[5] == src[6] );
+  VERIFY( out[6] == src[5] );
+  VERIFY( out[7] == src[8] );
+  VERIFY( out[8] == src[7] );
+  VERIFY( out[9] == src[10] );
+  VERIFY( out[10] == src[9] );
+}
+
+void
+test03()
+{
+#ifdef _GLIBCXX_USE_WCHAR_T
+  mbstate_t st;
+  constexpr codecvt_mode m = std::consume_header|std::generate_header;
+  codecvt_utf16<wchar_t, 0x10FFFF, m> conv;
+  const char src[] = "-\xFE\xFF\0\x61\xAB\xCD\xD8\x08\xDF\x45";
+  const size_t in_len = sizeof(wchar_t) == 4 ? 11 : 7;
+  const size_t out_len = sizeof(wchar_t) == 4 ? 3 : 2;
+  const char* const src_end = src + in_len;
+
+  int len = conv.length(st, src + 1, src_end, 1);
+  VERIFY( len == 4 );
+  len = conv.length(st, src + 1, src_end, 2);
+  VERIFY( len == 6 );
+  if (sizeof(wchar_t) == 4)
+  {
+    len = conv.length(st, src + 1, src_end, -1ul);
+    VERIFY( len == 10 );
+  }
+
+  wchar_t dst[out_len];
+  wchar_t* const dst_end = dst + out_len;
+  wchar_t* dst_next;
+  const char* src_cnext;
+  auto res = conv.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( dst[0] == 0x0061 );
+  VERIFY( dst[1] == 0xabcd );
+  if (sizeof(wchar_t) == 4)
+    VERIFY( dst[2] == 0x012345 );
+  VERIFY( src_cnext == src_end );
+  VERIFY( dst_next == dst_end );
+
+  char out[sizeof(src)] = { src[0] };
+  char* const out_end = out + in_len;
+  char* out_next;
+  const wchar_t* dst_cnext;
+  res = conv.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( out_next == out_end );
+  VERIFY( dst_cnext == dst_end );
+  VERIFY( out[1] == src[1] );
+  VERIFY( out[2] == src[2] );
+  VERIFY( out[3] == src[3] );
+  VERIFY( out[4] == src[4] );
+  VERIFY( out[5] == src[5] );
+  VERIFY( out[6] == src[6] );
+  if (sizeof(wchar_t) == 4)
+  {
+    VERIFY( out[7] == src[7] );
+    VERIFY( out[8] == src[8] );
+    VERIFY( out[9] == src[9] );
+    VERIFY( out[10] == src[10] );
+  }
+
+  codecvt_utf16<wchar_t, 0x10FFFF, m|std::little_endian> conv_le;
+
+  len = conv_le.length(st, src + 1, src_end, 1);
+  VERIFY( len == 4 );
+  len = conv_le.length(st, src + 1, src_end, 2);
+  VERIFY( len == 6 );
+  if (sizeof(wchar_t) == 4)
+  {
+    len = conv.length(st, src + 1, src_end, -1ul);
+    VERIFY( len == 10 );
+  }
+
+  res = conv_le.in(st, src + 1, src_end, src_cnext, dst, dst_end, dst_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( dst[0] == 0x0061 );
+  VERIFY( dst[1] == 0xabcd );
+  if (sizeof(wchar_t) == 4)
+    VERIFY( dst[2] == 0x012345 );
+  VERIFY( src_cnext == src_end );
+  VERIFY( dst_next == dst_end );
+
+  res = conv_le.out(st, dst, dst_end, dst_cnext, out + 1, out_end, out_next);
+  VERIFY( res == codecvt_base::ok );
+  VERIFY( out_next == out_end );
+  VERIFY( dst_cnext == dst_end );
+  VERIFY( out[1] == src[2] );
+  VERIFY( out[2] == src[1] );
+  VERIFY( out[3] == src[4] );
+  VERIFY( out[4] == src[3] );
+  VERIFY( out[5] == src[6] );
+  VERIFY( out[6] == src[5] );
+  if (sizeof(wchar_t) == 4)
+  {
+    VERIFY( out[7] == src[8] );
+    VERIFY( out[8] == src[7] );
+    VERIFY( out[9] == src[10] );
+    VERIFY( out[10] == src[9] );
+  }
+#endif
+}
+
+int
+main()
+{
+  test01();
+  test02();
+  test03();
+}

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2017-03-17 19:29 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-03-13 19:36 [PATCH] Various fixes for <codecvt> facets Jonathan Wakely
2017-03-14 18:46 ` Jonathan Wakely
2017-03-16 15:23   ` Jonathan Wakely
2017-03-16 17:22     ` Jonathan Wakely
2017-03-17 19:29     ` Jonathan Wakely

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).