public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [committed] libstdc++: Fix handling of incomplete UTF-8 sequences in _Unicode_view
@ 2024-05-07 13:49 Jonathan Wakely
  0 siblings, 0 replies; only message in thread
From: Jonathan Wakely @ 2024-05-07 13:49 UTC (permalink / raw)
  To: libstdc++, gcc-patches

Tested x86_64-linux. Pushed to trunk. gcc-14 backport to follow.

-- >8 --

Eddie Nolan reported to me that _Unicode_view was not correctly
implementing the substitution of ill-formed subsequences with U+FFFD,
because it failed to increment the counter when the iterator reached
the end of the input before a multibyte sequence was complete.  As a
result, the incomplete sequence was not completely consumed, and the
remaining code unit was then treated as another ill-formed sequence,
giving two U+FFFD characters instead of one.

To avoid similar mistakes in future, this change introduces a lambda
that increments the iterator and the counter together. This ensures the
counter is always incremented when the iterator is incremented, so that
we always know how many code units have been consumed.

libstdc++-v3/ChangeLog:

	* include/bits/unicode.h (_Unicode_view::_M_read_utf8): Ensure
	count of characters consumed is correct when the end of the
	input is reached unexpectedly.
	* testsuite/ext/unicode/view.cc: Test incomplete UTF-8
	sequences.
---
 libstdc++-v3/include/bits/unicode.h        | 24 ++++++++++------------
 libstdc++-v3/testsuite/ext/unicode/view.cc |  7 +++++++
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/libstdc++-v3/include/bits/unicode.h b/libstdc++-v3/include/bits/unicode.h
index 29813b743dc..46238143fb6 100644
--- a/libstdc++-v3/include/bits/unicode.h
+++ b/libstdc++-v3/include/bits/unicode.h
@@ -261,9 +261,13 @@ namespace __unicode
       {
 	_Guard<_Iter> __g{this, _M_curr()};
 	char32_t __c{};
-	uint8_t __u = *_M_curr()++;
 	const uint8_t __lo_bound = 0x80, __hi_bound = 0xBF;
+	uint8_t __u = *_M_curr()++;
 	uint8_t __to_incr = 1;
+	auto __incr = [&, this] {
+	  ++__to_incr;
+	  return ++_M_curr();
+	};
 
 	if (__u <= 0x7F) [[likely]]      // 0x00 to 0x7F
 	  __c = __u;
@@ -281,8 +285,7 @@ namespace __unicode
 	    else
 	      {
 		__c = (__c << 6) | (__u & 0x3F);
-		++_M_curr();
-		++__to_incr;
+		__incr();
 	      }
 	  }
 	else if (__u <= 0xEF) // 0xE0 to 0xEF
@@ -295,11 +298,10 @@ namespace __unicode
 
 	    if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
 	      __c = _S_error();
-	    else if (++_M_curr() == _M_last) [[unlikely]]
+	    else if (__incr() == _M_last) [[unlikely]]
 	      __c = _S_error();
 	    else
 	      {
-		++__to_incr;
 		__c = (__c << 6) | (__u & 0x3F);
 		__u = *_M_curr();
 
@@ -308,8 +310,7 @@ namespace __unicode
 		else
 		  {
 		    __c = (__c << 6) | (__u & 0x3F);
-		    ++_M_curr();
-		    ++__to_incr;
+		    __incr();
 		  }
 	      }
 	  }
@@ -323,21 +324,19 @@ namespace __unicode
 
 	    if (__u < __lo_bound_2 || __u > __hi_bound_2) [[unlikely]]
 	      __c = _S_error();
-	    else if (++_M_curr() == _M_last) [[unlikely]]
+	    else if (__incr() == _M_last) [[unlikely]]
 	      __c = _S_error();
 	    else
 	      {
-		++__to_incr;
 		__c = (__c << 6) | (__u & 0x3F);
 		__u = *_M_curr();
 
 		if (__u < __lo_bound || __u > __hi_bound) [[unlikely]]
 		  __c = _S_error();
-		else if (++_M_curr() == _M_last) [[unlikely]]
+		else if (__incr() == _M_last) [[unlikely]]
 		  __c = _S_error();
 		else
 		  {
-		    ++__to_incr;
 		    __c = (__c << 6) | (__u & 0x3F);
 		    __u = *_M_curr();
 
@@ -346,8 +345,7 @@ namespace __unicode
 		    else
 		      {
 			__c = (__c << 6) | (__u & 0x3F);
-			++_M_curr();
-			++__to_incr;
+			__incr();
 		      }
 		  }
 	      }
diff --git a/libstdc++-v3/testsuite/ext/unicode/view.cc b/libstdc++-v3/testsuite/ext/unicode/view.cc
index ee23b0b1d8a..6f3c099bd84 100644
--- a/libstdc++-v3/testsuite/ext/unicode/view.cc
+++ b/libstdc++-v3/testsuite/ext/unicode/view.cc
@@ -55,6 +55,13 @@ test_illformed_utf8()
   VERIFY( std::ranges::equal(v5, u8"\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\x41\uFFFD\uFFFD\x42"sv) );
   uc::_Utf8_view v6("\xe1\x80\xe2\xf0\x91\x92\xf1\xbf\x41"sv); // Table 3-11
   VERIFY( std::ranges::equal(v6, u8"\uFFFD\uFFFD\uFFFD\uFFFD\x41"sv) );
+
+  uc::_Utf32_view v7("\xe1\x80"sv);
+  VERIFY( std::ranges::equal(v7, U"\uFFFD"sv) );
+  uc::_Utf32_view v8("\xf1\x80"sv);
+  VERIFY( std::ranges::equal(v8, U"\uFFFD"sv) );
+  uc::_Utf32_view v9("\xf1\x80\x80"sv);
+  VERIFY( std::ranges::equal(v9, U"\uFFFD"sv) );
 }
 
 constexpr void
-- 
2.44.0


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2024-05-07 13:49 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-05-07 13:49 [committed] libstdc++: Fix handling of incomplete UTF-8 sequences in _Unicode_view Jonathan Wakely

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).