public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [patch] Fix codecvt<char16_t, char, mbstate_t>
@ 2015-02-18 19:07 Jonathan Wakely
  2015-02-18 19:23 ` Jakub Jelinek
  2015-02-18 20:21 ` Jonathan Wakely
  0 siblings, 2 replies; 3+ messages in thread
From: Jonathan Wakely @ 2015-02-18 19:07 UTC (permalink / raw)
  To: libstdc++, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 316 bytes --]

While working on PR64797 I discovered that the codecvt<char16_t,...>
specialization was, erm, completely broken when creating UTF-16
surrogate pairs.

This fixes it and adds a test, based on the char32_t one I added to
the testsuite yesterday. Tested x86_64-linux (little-endian) and
powerpc64-linux (big-endian).



[-- Attachment #2: patch.txt --]
[-- Type: text/x-patch, Size: 6230 bytes --]

commit c0a8047982d0911f74647dba43d65f3b14113f1c
Author: Jonathan Wakely <jwakely@redhat.com>
Date:   Wed Feb 18 11:49:11 2015 +0000

    	* src/c++11/codecvt.cc (write_utf16_code_point): Fix code to output
    	surrogate pairs.
    	(utf16_in): Pass mode argument to write_utf16_code_point.
    	(codecvt<char16_t, char, mbstate_t>::do_in): Set mode according to
    	native byte order.
    	* testsuite/22_locale/codecvt/char16_t.cc: New.
    	* testsuite/22_locale/codecvt/in/wchar_t/1.cc: Fix typo.

diff --git a/libstdc++-v3/src/c++11/codecvt.cc b/libstdc++-v3/src/c++11/codecvt.cc
index 594dae6..aebd3f3 100644
--- a/libstdc++-v3/src/c++11/codecvt.cc
+++ b/libstdc++-v3/src/c++11/codecvt.cc
@@ -295,13 +295,10 @@ namespace
       {
 	// Algorithm from http://www.unicode.org/faq/utf_bom.html#utf16-4
 	const char32_t LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
-	const char32_t SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;
 	char16_t lead = LEAD_OFFSET + (codepoint >> 10);
 	char16_t trail = 0xDC00 + (codepoint & 0x3FF);
-	char32_t utf16bytes = (lead << 10) + trail + SURROGATE_OFFSET;
-
-	to.next[0] = adjust_byte_order(utf16bytes >> 16, mode);
-	to.next[1] = adjust_byte_order(utf16bytes & 0xFFFF, mode);
+	to.next[0] = adjust_byte_order(lead, mode);
+	to.next[1] = adjust_byte_order(trail, mode);
 	to.next += 2;
 	return true;
       }
@@ -400,7 +397,7 @@ namespace
 	  return codecvt_base::partial;
 	if (codepoint > maxcode)
 	  return codecvt_base::error;
-	if (!write_utf16_code_point(to, codepoint, {}))
+	if (!write_utf16_code_point(to, codepoint, mode))
 	  {
 	    from.next = first;
 	    return codecvt_base::partial;
@@ -618,7 +615,12 @@ do_in(state_type&, const extern_type* __from, const extern_type* __from_end,
 {
   range<const char> from{ __from, __from_end };
   range<char16_t> to{ __to, __to_end };
-  auto res = utf16_in(from, to);
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+  codecvt_mode mode = {};
+#else
+  codecvt_mode mode = little_endian;
+#endif
+  auto res = utf16_in(from, to, max_code_point, mode);
   __from_next = from.next;
   __to_next = to.next;
   return res;
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
new file mode 100644
index 0000000..14477f5
--- /dev/null
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
@@ -0,0 +1,97 @@
+// Copyright (C) 2015 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// { dg-options "-std=gnu++11" }
+
+// [locale.codecvt], C++11 22.4.1.4.  specialization.
+
+#include <locale>
+#include <cstring>
+#include <testsuite_hooks.h>
+
+void
+test01()
+{
+  using namespace std;
+  typedef codecvt<char16_t, char, mbstate_t> codecvt_c16;
+  locale loc_c = locale::classic();
+  VERIFY(has_facet<codecvt_c16>(loc_c));
+  const codecvt_c16* const cvt = &use_facet<codecvt_c16>(loc_c);
+
+  VERIFY(!cvt->always_noconv());
+  VERIFY(cvt->max_length() == 3);
+  VERIFY(cvt->encoding() == 0);
+
+  const char u8dat[] = u8"H\U000000E4ll\U000000F6 \U0001F63F \U000056FD "
+    u8"\U0000222B f(\U000003BA) exp(-2\U000003C0\U000003C9) d\U000003BA "
+    u8"\U0001F6BF \U0001F6BF \U0001F648 \U00000413\U00000435\U0000043E"
+    u8"\U00000433\U00000440\U00000430\U00000444\U00000438\U0000044F \U0000FB05";
+  const char* const u8dat_end = std::end(u8dat);
+
+  const char16_t u16dat[] = u"H\U000000E4ll\U000000F6 \U0001F63F \U000056FD "
+    u"\U0000222B f(\U000003BA) exp(-2\U000003C0\U000003C9) d\U000003BA "
+    u"\U0001F6BF \U0001F6BF \U0001F648 \U00000413\U00000435\U0000043E"
+    u"\U00000433\U00000440\U00000430\U00000444\U00000438\U0000044F \U0000FB05";
+  const char16_t* const u16dat_end = std::end(u16dat);
+
+  {
+    const size_t len = u16dat_end - u16dat + 1;
+    char16_t* const buffer = new char16_t[len];
+    char16_t* const buffer_end = buffer + len;
+
+    const char* from_next;
+    char16_t* to_next;
+
+    codecvt_c16::state_type state01;
+    state01 = {};
+    codecvt_base::result res = cvt->in(state01, u8dat, u8dat_end, from_next,
+                                       buffer, buffer_end, to_next);
+
+    VERIFY(res == codecvt_base::ok);
+    VERIFY(from_next == u8dat_end);
+    VERIFY(std::memcmp((void*)buffer, (void*)u16dat, sizeof(u16dat)) == 0);
+
+    delete[] buffer;
+  }
+
+  {
+    const size_t len = u8dat_end - u8dat + 1;
+    char* const buffer = new char[len];
+    char* const buffer_end = buffer + len;
+
+    const char16_t* from_next;
+    char* to_next;
+
+    codecvt_c16::state_type state01;
+    state01 = {};
+    codecvt_base::result res = cvt->out(state01, u16dat, u16dat_end,
+from_next,
+                                        buffer, buffer_end, to_next);
+
+    VERIFY(res == codecvt_base::ok);
+    VERIFY(from_next == u16dat_end);
+    VERIFY(std::memcmp((void*)buffer, (void*)u8dat, sizeof(u8dat)) == 0);
+
+    delete[] buffer;
+  }
+}
+
+int
+main()
+{
+  test01();
+}
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/in/wchar_t/1.cc b/libstdc++-v3/testsuite/22_locale/codecvt/in/wchar_t/1.cc
index 1e682a6..ff0b657 100644
--- a/libstdc++-v3/testsuite/22_locale/codecvt/in/wchar_t/1.cc
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/in/wchar_t/1.cc
@@ -25,7 +25,7 @@
 
 // Need to explicitly set the state(mbstate_t) to zero.
 // How to do this is not specified by the ISO C99 standard, so we
-// might need to add some operators to make the intuiative case
+// might need to add some operators to make the intuitive case
 // work:
 //   w_codecvt::state_type state00;
 //   state00 = 0;  

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [patch] Fix codecvt<char16_t, char, mbstate_t>
  2015-02-18 19:07 [patch] Fix codecvt<char16_t, char, mbstate_t> Jonathan Wakely
@ 2015-02-18 19:23 ` Jakub Jelinek
  2015-02-18 20:21 ` Jonathan Wakely
  1 sibling, 0 replies; 3+ messages in thread
From: Jakub Jelinek @ 2015-02-18 19:23 UTC (permalink / raw)
  To: Jonathan Wakely; +Cc: libstdc++, gcc-patches

On Wed, Feb 18, 2015 at 07:07:05PM +0000, Jonathan Wakely wrote:
> While working on PR64797 I discovered that the codecvt<char16_t,...>
> specialization was, erm, completely broken when creating UTF-16
> surrogate pairs.
> 
> This fixes it and adds a test, based on the char32_t one I added to
> the testsuite yesterday. Tested x86_64-linux (little-endian) and
> powerpc64-linux (big-endian).

OK for trunk from the release manager's point of view, thanks.

> commit c0a8047982d0911f74647dba43d65f3b14113f1c
> Author: Jonathan Wakely <jwakely@redhat.com>
> Date:   Wed Feb 18 11:49:11 2015 +0000
> 
>     	* src/c++11/codecvt.cc (write_utf16_code_point): Fix code to output
>     	surrogate pairs.
>     	(utf16_in): Pass mode argument to write_utf16_code_point.
>     	(codecvt<char16_t, char, mbstate_t>::do_in): Set mode according to
>     	native byte order.
>     	* testsuite/22_locale/codecvt/char16_t.cc: New.
>     	* testsuite/22_locale/codecvt/in/wchar_t/1.cc: Fix typo.

	Jakub

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [patch] Fix codecvt<char16_t, char, mbstate_t>
  2015-02-18 19:07 [patch] Fix codecvt<char16_t, char, mbstate_t> Jonathan Wakely
  2015-02-18 19:23 ` Jakub Jelinek
@ 2015-02-18 20:21 ` Jonathan Wakely
  1 sibling, 0 replies; 3+ messages in thread
From: Jonathan Wakely @ 2015-02-18 20:21 UTC (permalink / raw)
  To: libstdc++, gcc-patches

[-- Attachment #1: Type: text/plain, Size: 444 bytes --]

On 18/02/15 19:07 +0000, Jonathan Wakely wrote:
>While working on PR64797 I discovered that the codecvt<char16_t,...>
>specialization was, erm, completely broken when creating UTF-16
>surrogate pairs.
>
>This fixes it and adds a test, based on the char32_t one I added to
>the testsuite yesterday. Tested x86_64-linux (little-endian) and
>powerpc64-linux (big-endian).

Committed, along with this tweak, which adds dg-require-cstdint so that
the tests only run on targets where they are supported.



[-- Attachment #2: patch.txt --]
[-- Type: text/x-patch, Size: 1102 bytes --]

commit ada5fdcd89fb1e91c43d2bfcba852fdaf27363b0
Author: Jonathan Wakely <jwakely@redhat.com>
Date:   Wed Feb 18 19:51:00 2015 +0000

    	* testsuite/22_locale/codecvt/char16_t.cc: Add dg-require-cstdint.
    	* testsuite/22_locale/codecvt/char32_t.cc: Likewise.

diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
index 14477f5..9271eca 100644
--- a/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/char16_t.cc
@@ -16,6 +16,7 @@
 // <http://www.gnu.org/licenses/>.
 
 // { dg-options "-std=gnu++11" }
+// { dg-require-cstdint "" }
 
 // [locale.codecvt], C++11 22.4.1.4.  specialization.
 
diff --git a/libstdc++-v3/testsuite/22_locale/codecvt/char32_t.cc b/libstdc++-v3/testsuite/22_locale/codecvt/char32_t.cc
index 07f72c4..ebf30ad 100644
--- a/libstdc++-v3/testsuite/22_locale/codecvt/char32_t.cc
+++ b/libstdc++-v3/testsuite/22_locale/codecvt/char32_t.cc
@@ -1,4 +1,5 @@
 // { dg-options "-std=gnu++11" }
+// { dg-require-cstdint "" }
 
 // 2014-04-24 Rüdiger Sonderfeld
 

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2015-02-18 20:21 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-02-18 19:07 [patch] Fix codecvt<char16_t, char, mbstate_t> Jonathan Wakely
2015-02-18 19:23 ` Jakub Jelinek
2015-02-18 20:21 ` Jonathan Wakely

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).