From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <SRS0=uU6R=G6=redhat.com=jwakely@sourceware.org>
Received: from us-smtp-delivery-124.mimecast.com (us-smtp-delivery-124.mimecast.com [170.10.133.124])
	by sourceware.org (Postfix) with ESMTPS id 680743858425
	for <libstdc++@gcc.gnu.org>; Fri, 17 Nov 2023 16:03:25 +0000 (GMT)
DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 680743858425
Authentication-Results: sourceware.org; dmarc=pass (p=none dis=none) header.from=redhat.com
Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=redhat.com
ARC-Filter: OpenARC Filter v1.0.0 sourceware.org 680743858425
Authentication-Results: server2.sourceware.org; arc=none smtp.remote-ip=170.10.133.124
ARC-Seal: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1700237014; cv=none;
	b=gixGm5mUaOB/ZjmNMIVpeNggi/wA1tES64vGs22AMsn3YyStkuoOlHyLUKjkhKE1VWNjB9X/u3XcsL2FuON8InttKY4KLWwk7dMpYeQkkmNO9ZCH08K0593VSXZRIxR9VqsZ/I0Ajqr+Cmwh0zWk3GVHWjoeoF6rp4n+ZLQEoI0=
ARC-Message-Signature: i=1; a=rsa-sha256; d=sourceware.org; s=key;
	t=1700237014; c=relaxed/simple;
	bh=61DNr61O6DOp7uQHOFV3iwoxeZj2HAcaBtEFh0VEtzk=;
	h=DKIM-Signature:From:To:Subject:Date:Message-ID:MIME-Version; b=Qn9oZ7ZQo5Y2AM03UqK2g6HtP6mPFsnOcU2/E4/qbtbF9dmg5ParUsTXlGEpCWd4M2uW0ZPXLB4MnPrFANp26gOdsZ65xeIHQ/EA/IgSgr1Q5bqXdBmO4Ajrf2EFzigxuNbFJLFhCOo1eVutTVZpbOQ7s15YbDiDttxObQHAu04=
ARC-Authentication-Results: i=1; server2.sourceware.org
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=redhat.com;
	s=mimecast20190719; t=1700237004;
	h=from:from:reply-to:subject:subject:date:date:message-id:message-id:
	 to:to:cc:mime-version:mime-version:content-type:content-type:
	 content-transfer-encoding:content-transfer-encoding:
	 in-reply-to:in-reply-to:references:references;
	bh=Wjs5bFeoN7/ebeGjQ+Mir01YLRW5/bl8X0DT5ffPrxU=;
	b=SfCCIO7nLAtT1Jgaf0FzPJWHG7Yd4bJXoou1VOuyW5eVdNFYCTjdz2gkF0D9Ln+T6rFxh/
	tVwupaSQN7Jk2/uXnI8Ge6mk2RyQPv5uhzcRS6E4mPGXhFissnjQUziK7WdchT33XwxJar
	k3kbrlLICcn344dfHbQj7Zizyvsj3NY=
Received: from mimecast-mx02.redhat.com (mimecast-mx02.redhat.com
 [66.187.233.88]) by relay.mimecast.com with ESMTP with STARTTLS
 (version=TLSv1.3, cipher=TLS_AES_256_GCM_SHA384) id
 us-mta-616-f4hxRp4INU-0aHTShvQEsQ-1; Fri, 17 Nov 2023 11:03:23 -0500
X-MC-Unique: f4hxRp4INU-0aHTShvQEsQ-1
Received: from smtp.corp.redhat.com (int-mx05.intmail.prod.int.rdu2.redhat.com [10.11.54.5])
	(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
	 key-exchange X25519 server-signature RSA-PSS (2048 bits) server-digest SHA256)
	(No client certificate requested)
	by mimecast-mx02.redhat.com (Postfix) with ESMTPS id C8742185A786;
	Fri, 17 Nov 2023 16:03:22 +0000 (UTC)
Received: from localhost (unknown [10.42.28.9])
	by smtp.corp.redhat.com (Postfix) with ESMTP id 929EF5028;
	Fri, 17 Nov 2023 16:03:22 +0000 (UTC)
From: Jonathan Wakely <jwakely@redhat.com>
To: libstdc++@gcc.gnu.org,
	gcc-patches@gcc.gnu.org
Subject: [PATCH 2/2] libstdc++: Ensure valid UTF-8 in std::vprint_unicode
Date: Fri, 17 Nov 2023 15:54:39 +0000
Message-ID: <20231117160320.1513815-2-jwakely@redhat.com>
In-Reply-To: <20231117160320.1513815-1-jwakely@redhat.com>
References: <20231117160320.1513815-1-jwakely@redhat.com>
MIME-Version: 1.0
X-Scanned-By: MIMEDefang 3.4.1 on 10.11.54.5
X-Mimecast-Spam-Score: 0
X-Mimecast-Originator: redhat.com
Content-Type: text/plain
Content-Transfer-Encoding: 8bit
X-Spam-Status: No, score=-11.8 required=5.0 tests=BAYES_00,DKIMWL_WL_HIGH,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,GIT_PATCH_0,RCVD_IN_DNSWL_NONE,RCVD_IN_MSPIKE_H3,RCVD_IN_MSPIKE_WL,SPF_HELO_NONE,SPF_NONE,TXREP,T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6
X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org
List-Id: <libstdc++.gcc.gnu.org>

This is a naive implementation of the UTF-8 validation algorithm, which
could definitely be optimized. But it's faster than using
std::codecvt_utf8 and checking the result of that, which is the only
existing code we have to do it in the library.

As the TODO suggests, we could do the UTF-8 to UTF-16 conversion at the
same time. But that is only needed for Windows and as I said in the 1/2
email, the output for Windows seems to be broken currently anyway and I
can't test it properly.

-- >8 --

libstdc++-v3/ChangeLog:

	* include/bits/locale_conv.h (__to_valid_utf8): New function.
	* include/std/ostream (vprint_unicode): Use it.
	* include/std/print (vprint_unicode): Use it.
---
 libstdc++-v3/include/bits/locale_conv.h | 104 ++++++++++++++++++++++++
 libstdc++-v3/include/std/ostream        |  74 +++++++++++------
 libstdc++-v3/include/std/print          |   8 +-
 3 files changed, 160 insertions(+), 26 deletions(-)

diff --git a/libstdc++-v3/include/bits/locale_conv.h b/libstdc++-v3/include/bits/locale_conv.h
index 284142a360a..f6ade1d0395 100644
--- a/libstdc++-v3/include/bits/locale_conv.h
+++ b/libstdc++-v3/include/bits/locale_conv.h
@@ -624,6 +624,110 @@ _GLIBCXX_END_NAMESPACE_CXX11
       bool			_M_always_noconv;
     };
 
+#if __cplusplus >= 202002L
+  template<typename _CharT = char> // _CharT is currently unused (see TODO).
+  bool // True if __s was already valid UTF-8, false if it was modified.
+  __to_valid_utf8(string& __s)
+  {
+    // Replace each ill-formed UTF-8 sequence in __s with U+FFFD, in place.
+    // TODO if _CharT is wchar_t then transcode at the same time.
+    unsigned __seen = 0, __needed = 0; // Continuation bytes seen / expected.
+    unsigned char __lo_bound = 0x80, __hi_bound = 0xBF; // Range for next byte.
+    size_t __errors = 0; // Number of 0xFF markers written into __s.
+
+    auto __q = __s.data(), __eoq = __q + __s.size();
+    while (__q != __eoq)
+      {
+	unsigned char __byte = *__q;
+	if (__needed == 0)
+	  {
+	    if (__byte <= 0x7F)      // 0x00 to 0x7F
+	      {
+		while (++__q != __eoq && (unsigned char)*__q <= 0x7F)
+		  { } // Fast forward to the next non-ASCII character.
+		continue;
+	      }
+	    else if (__byte < 0xC2)  // 0x80 to 0xC1 cannot start a sequence
+	      {
+		*__q = 0xFF; // Marker: becomes U+FFFD in the second pass.
+		++__errors;
+	      }
+	    else if (__byte <= 0xDF) // 0xC2 to 0xDF
+	      {
+		__needed = 1;
+	      }
+	    else if (__byte <= 0xEF) // 0xE0 to 0xEF
+	      {
+		if (__byte == 0xE0)
+		  __lo_bound = 0xA0; // Reject overlong encodings.
+		else if (__byte == 0xED)
+		  __hi_bound = 0x9F; // Reject UTF-16 surrogates.
+
+		__needed = 2;
+	      }
+	    else if (__byte <= 0xF4) // 0xF0 to 0xF4
+	      {
+		if (__byte == 0xF0)
+		  __lo_bound = 0x90; // Reject overlong encodings.
+		else if (__byte == 0xF4)
+		  __hi_bound = 0x8F; // Reject values above U+10FFFF.
+
+		__needed = 3;
+	      }
+	    else                     // 0xF5 to 0xFF
+	      {
+		*__q = 0xFF;
+		++__errors;
+	      }
+	  }
+	else
+	  {
+	    if (__byte < __lo_bound || __byte > __hi_bound)
+	      {
+		*(__q - __seen - 1) = 0xFF; // Lead byte becomes one U+FFFD.
+		__builtin_memset(__q - __seen, 0xFE, __seen); // Drop the rest.
+		++__errors;
+		__needed = __seen = 0;
+		__lo_bound = 0x80;
+		__hi_bound = 0xBF;
+		continue; // Reprocess the current character.
+	      }
+
+	    __lo_bound = 0x80; // Only the first continuation byte is restricted.
+	    __hi_bound = 0xBF;
+	    ++__seen;
+	    if (__seen == __needed)
+	      __needed = __seen = 0;
+	  }
+	__q++;
+      }
+
+    if (__needed)
+      {
+	// The string ends with an incomplete multibyte sequence.
+	if (__seen)
+	  __s.resize(__s.size() - __seen);
+	__s.back() = 0xFF; // The lead byte of the truncated sequence.
+	++__errors;
+      }
+
+    if (__errors == 0)
+      return true;
+
+    string __s2;
+    __s2.reserve(__s.size() + __errors * 2); // 1-byte marker -> 3-byte U+FFFD.
+    for (unsigned char __byte : __s)
+      {
+	if (__byte == 0xFF)
+	  __s2 += "\xEF\xBF\xBD"; // U+FFFD as UTF-8, whatever the exec charset.
+	else if (__byte != 0xFE) // 0xFE marks a byte to be discarded.
+	  __s2 += (char)__byte;
+      }
+    __s = std::move(__s2);
+    return false;
+  }
+#endif // C++20
+
   /// @} group locales
 
 _GLIBCXX_END_NAMESPACE_VERSION
diff --git a/libstdc++-v3/include/std/ostream b/libstdc++-v3/include/std/ostream
index e81c39a7c80..760aaa206da 100644
--- a/libstdc++-v3/include/std/ostream
+++ b/libstdc++-v3/include/std/ostream
@@ -917,42 +917,68 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   inline void
   vprint_unicode(ostream& __os, string_view __fmt, format_args __args)
   {
-    // TODO: diagnose invalid UTF-8 code units
-#ifdef _WIN32
-    int __fd_for_console(std::streambuf*);
-    void __write_utf16_to_console(int, string);
-
-    // If stream refers to a terminal convert to UTF-16 and use WriteConsoleW.
-    if (int __fd = __fd_for_console(__os.rdbuf()); __fd >= 0)
+    ostream::sentry __cerb(__os); // Nothing is formatted if the sentry fails.
+    if (__cerb)
       {
-	ostream::sentry __cerb(__os);
-	if (__cerb)
+	string __out = std::vformat(__os.getloc(), __fmt, __args); // Stream locale.
+	std::__to_valid_utf8(__out); // Ill-formed UTF-8 is replaced in __out.
+
+#ifdef _WIN32
+	int __fd_for_console(std::streambuf*);
+	void __write_utf16_to_console(int, string); // NOTE(review): void, yet tested as a condition below -- confirm return type.
+
+	// If stream refers to a terminal output UTF-16 using WriteConsoleW.
+	if (int __fd = __fd_for_console(__os.rdbuf()); __fd >= 0)
 	  {
-	    string __out = std::vformat(__fmt, __args);
 	    ios_base::iostate __err = ios_base::goodbit;
 	    __try
-	      {
-		if (__os.rdbuf()->pubsync() == -1)
-		  __err = ios::badbit;
-		else if (__write_utf16_to_console(__fd, __out))
-		  __err = ios::badbit;
-	      }
+	    {
+	      if (__os.rdbuf()->pubsync() == -1) // Sync the buffer before the console write.
+		__err = ios::badbit;
+	      else if (__write_utf16_to_console(__fd, __out))
+		__err = ios::badbit;
+	    }
 	    __catch(const __cxxabiv1::__forced_unwind&)
-	      {
-		__os._M_setstate(ios_base::badbit);
-		__throw_exception_again;
-	      }
+	    {
+	      __os._M_setstate(ios_base::badbit);
+	      __throw_exception_again;
+	    }
 	    __catch(...)
-	      { __os._M_setstate(ios_base::badbit); }
+	    { __os._M_setstate(ios_base::badbit); }
 
 	    if (__err)
 	      __os.setstate(__err);
+	    return; // Console path handled; skip the stream write below.
 	  }
-      }
 #endif
-    std::vprint_nonunicode(__os, __fmt, __args);
-  }
 
+	__try
+	  {
+	    const streamsize __w = __os.width();
+	    const streamsize __n = __out.size();
+	    if (__w > __n) // Output is narrower than the field width: pad it.
+	      {
+		const bool __left
+		  = (__os.flags() & ios_base::adjustfield) == ios_base::left;
+		if (!__left) // Anything other than left adjustment pads first.
+		  std::__ostream_fill(__os, __w - __n);
+		if (__os.good())
+		  std::__ostream_write(__os, __out.data(), __n);
+		if (__left && __os.good())
+		  std::__ostream_fill(__os, __w - __n);
+	      }
+	    else
+	      std::__ostream_write(__os, __out.data(), __n);
+	  }
+	__catch(const __cxxabiv1::__forced_unwind&)
+	  {
+	    __os._M_setstate(ios_base::badbit);
+	    __throw_exception_again;
+	  }
+	__catch(...)
+	  { __os._M_setstate(ios_base::badbit); } // Other exceptions only set badbit.
+      }
+  }
 
   template<typename... _Args>
     inline void
diff --git a/libstdc++-v3/include/std/print b/libstdc++-v3/include/std/print
index 75e78841247..096b97b1ef7 100644
--- a/libstdc++-v3/include/std/print
+++ b/libstdc++-v3/include/std/print
@@ -62,7 +62,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
   inline void
   vprint_unicode(FILE* __stream, string_view __fmt, format_args __args)
   {
-    // TODO: diagnose invalid UTF-8 code units
+    string __out = std::vformat(__fmt, __args);
+    std::__to_valid_utf8(__out);
+
 #ifdef _WIN32
     int __fd_for_console(FILE*);
     void __write_utf16_to_console(int, string);
@@ -82,7 +84,9 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
 	_GLIBCXX_THROW_OR_ABORT(system_error(__e, "std::vprint_unicode"));
       }
 #endif
-    std::vprint_nonunicode(__stream, __fmt, __args);
+
+    if (std::fwrite(__out.data(), 1, __out.size(), __stream) != __out.size())
+      __throw_system_error(EIO);
   }
 
   template<typename... _Args>
-- 
2.41.0