public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
* [PATCH] libcpp: Small incremental patch for P1854R4 [PR110341]
@ 2023-08-26 11:11 Jakub Jelinek
  2023-08-28 14:00 ` [PATCH] libcpp, v2: " Jakub Jelinek
  0 siblings, 1 reply; 2+ messages in thread
From: Jakub Jelinek @ 2023-08-26 11:11 UTC (permalink / raw)
  To: Jason Merrill; +Cc: gcc-patches

Hi!

The following incremental patch to the PR110341 posted patch uses
a special conversion callback instead of conversion from host charset
(UTF-8/UTF-EBCDIC) to UTF-32, and also ignores all diagnostics from the
second cpp_interpret_string which should just count chars.  The UTF-EBCDIC
is untested, but simple enough that it should just work.

2023-08-26  Jakub Jelinek  <jakub@redhat.com>

	PR c++/110341
	* charset.cc (one_count_chars, convert_count_chars): New functions.
	(narrow_str_to_charconst): Call cpp_interpret_string with type
	rather than CPP_STRING32, temporarily override for that call
	pfile->cb.diagnostic to noop_diagnostic_cb and
	pfile->narrow_cset_desc.func to convert_count_chars and just compare
	str.len against str2.len.

--- libcpp/charset.cc.jj	2023-08-25 17:14:14.098733396 +0200
+++ libcpp/charset.cc	2023-08-26 12:57:44.858858994 +0200
@@ -446,6 +446,74 @@ one_utf16_to_utf8 (iconv_t bigend, const
   return 0;
 }
 
+
+/* Counting-only conversion step: consumes one character from *INBUFP
+   and stores one uchar at *OUTBUFP; the stored value doesn't matter,
+   only that each character yields one uchar.  Returns 0 or an errno.  */
+
+static inline int
+one_count_chars (iconv_t, const uchar **inbufp, size_t *inbytesleftp,
+		 uchar **outbufp, size_t *outbytesleftp)
+{
+  uchar *outbuf;  /* NOTE(review): not referenced anywhere in this function.  */
+  cppchar_t s = 0;  /* Only handed to one_utf8_to_cppchar; never read here.  */
+  int rval;
+
+  /* Check for space first, since we know exactly how much we need.  */
+  if (*outbytesleftp < 1)
+    return E2BIG;
+
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
+  if (rval)  /* Pass the decoder's nonzero result straight to the caller.  */
+    return rval;
+#else  /* Non-ASCII host: classify UTF-EBCDIC bytes with the table below.  */
+  if (*inbytesleftp < 1)
+    return EINVAL;  /* No input byte left to classify.  */
+  static const uchar utf_ebcdic_map[256] = {
+    /* Indexed by byte value: 0 or 1 = single-byte character, 2..7 = first byte of a sequence of that many bytes, 9 = byte valid only after a first byte.  See table 4 in http://unicode.org/reports/tr16/tr16-7.2.html  */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1,
+    1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1,
+    1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1,
+    9, 9, 9, 9, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
+    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
+    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 3, 3,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4,
+    1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 5, 5, 5,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 6, 6, 7, 7, 0
+  };
+  rval = utf_ebcdic_map[**inbufp];
+  if (rval == 9)  /* A class-9 byte cannot begin a character.  */
+    return EILSEQ;
+  if (rval == 0)  /* Class 0 is consumed as one byte, exactly like class 1.  */
+    rval = 1;
+  if (rval >= 2)
+    {
+      if (*inbytesleftp < rval)  /* Sequence would run past the end of input.  */
+	return EINVAL;
+      for (int i = 1; i < rval; ++i)  /* Each remaining byte must be class 9.  */
+	if (utf_ebcdic_map[(*inbufp)[i]] != 9)
+	  return EILSEQ;
+    }
+  *inbytesleftp -= rval;
+  *inbufp += rval;
+#endif
+
+  **outbufp = ' ';  /* Placeholder byte; only the count of bytes written is used.  */
+
+  *outbufp += 1;
+  *outbytesleftp -= 1;
+  return 0;
+}
+
+
 /* Helper routine for the next few functions.  The 'const' on
    one_conversion means that we promise not to modify what function is
    pointed to, which lets the inliner see through it.  */
@@ -529,6 +597,15 @@ convert_utf32_utf8 (iconv_t cd, const uc
   return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
 }
 
+/* Pseudo-conversion: drives one_count_chars through conversion_loop over
+   FLEN bytes at FROM, so afterwards only to->len is significant.  */
+static bool
+convert_count_chars (iconv_t cd, const uchar *from,
+		     size_t flen, struct _cpp_strbuf *to)
+{
+  return conversion_loop (one_count_chars, cd, from, flen, to);
+}
+
 /* Identity conversion, used when we have no alternative.  */
 static bool
 convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
@@ -2613,15 +2690,22 @@ narrow_str_to_charconst (cpp_reader *pfi
 	 ill-formed.  We need to count the number of c-chars and compare
 	 that to str.len.  */
       cpp_string str2 = { 0, 0 };
-      if (cpp_interpret_string (pfile, &token->val.str, 1, &str2,
-				CPP_STRING32))
+      bool (*saved_diagnostic_handler) (cpp_reader *, enum cpp_diagnostic_level,
+					enum cpp_warning_reason, rich_location *,
+					const char *, va_list *)
+	ATTRIBUTE_FPTR_PRINTF(5,0);
+      saved_diagnostic_handler = pfile->cb.diagnostic;
+      pfile->cb.diagnostic = noop_diagnostic_cb;
+      convert_f save_func = pfile->narrow_cset_desc.func;
+      pfile->narrow_cset_desc.func = convert_count_chars;
+      bool ret = cpp_interpret_string (pfile, &token->val.str, 1, &str2, type);
+      pfile->narrow_cset_desc.func = save_func;
+      pfile->cb.diagnostic = saved_diagnostic_handler;
+      if (ret)
 	{
-	  size_t width32 = converter_for_type (pfile, CPP_STRING32).width;
-	  size_t nbwc = width32 / width;
-	  size_t len = str2.len / nbwc;
 	  if (str2.text != token->val.str.text)
 	    free ((void *)str2.text);
-	  if (str.len > len)
+	  if (str.len > str2.len)
 	    {
 	      diagnosed
 		= cpp_error (pfile, CPP_DL_PEDWARN,

	Jakub


^ permalink raw reply	[flat|nested] 2+ messages in thread

* [PATCH] libcpp, v2: Small incremental patch for P1854R4 [PR110341]
  2023-08-26 11:11 [PATCH] libcpp: Small incremental patch for P1854R4 [PR110341] Jakub Jelinek
@ 2023-08-28 14:00 ` Jakub Jelinek
  0 siblings, 0 replies; 2+ messages in thread
From: Jakub Jelinek @ 2023-08-28 14:00 UTC (permalink / raw)
  To: Jason Merrill; +Cc: gcc-patches

Hi!

Sorry, testing revealed an unused uchar *outbuf; declaration breaking the
build, here is the same patch with that one line removed,
bootstrapped/regtested on x86_64-linux and i686-linux (on top of the earlier
PR110341 patch).

On Sat, Aug 26, 2023 at 01:11:06PM +0200, Jakub Jelinek via Gcc-patches wrote:
> The following incremental patch to the PR110341 posted patch uses
> a special conversion callback instead of conversion from host charset
> (UTF-8/UTF-EBCDIC) to UTF-32, and also ignores all diagnostics from the
> second cpp_interpret_string which should just count chars.  The UTF-EBCDIC
> is untested, but simple enough that it should just work.

2023-08-28  Jakub Jelinek  <jakub@redhat.com>

	PR c++/110341
	* charset.cc (one_count_chars, convert_count_chars): New functions.
	(narrow_str_to_charconst): Call cpp_interpret_string with type
	rather than CPP_STRING32, temporarily override for that call
	pfile->cb.diagnostic to noop_diagnostic_cb and
	pfile->narrow_cset_desc.func to convert_count_chars and just compare
	str.len against str2.len.

--- libcpp/charset.cc.jj	2023-08-25 17:14:14.098733396 +0200
+++ libcpp/charset.cc	2023-08-28 12:57:44.858858994 +0200
@@ -446,6 +446,73 @@ one_utf16_to_utf8 (iconv_t bigend, const
   return 0;
 }
 
+
+/* Counting-only conversion step: consumes one character from *INBUFP
+   and stores one uchar at *OUTBUFP; the stored value doesn't matter,
+   only that each character yields one uchar.  Returns 0 or an errno.  */
+
+static inline int
+one_count_chars (iconv_t, const uchar **inbufp, size_t *inbytesleftp,
+		 uchar **outbufp, size_t *outbytesleftp)
+{
+  cppchar_t s = 0;  /* Only handed to one_utf8_to_cppchar; never read here.  */
+  int rval;
+
+  /* Check for space first, since we know exactly how much we need.  */
+  if (*outbytesleftp < 1)
+    return E2BIG;
+
+#if HOST_CHARSET == HOST_CHARSET_ASCII
+  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
+  if (rval)  /* Pass the decoder's nonzero result straight to the caller.  */
+    return rval;
+#else  /* Non-ASCII host: classify UTF-EBCDIC bytes with the table below.  */
+  if (*inbytesleftp < 1)
+    return EINVAL;  /* No input byte left to classify.  */
+  static const uchar utf_ebcdic_map[256] = {
+    /* Indexed by byte value: 0 or 1 = single-byte character, 2..7 = first byte of a sequence of that many bytes, 9 = byte valid only after a first byte.  See table 4 in http://unicode.org/reports/tr16/tr16-7.2.html  */
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1,
+    1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1,
+    1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1,
+    9, 9, 9, 9, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
+    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
+    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
+    2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 3, 3,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4,
+    1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 5, 5, 5,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 6, 6, 7, 7, 0
+  };
+  rval = utf_ebcdic_map[**inbufp];
+  if (rval == 9)  /* A class-9 byte cannot begin a character.  */
+    return EILSEQ;
+  if (rval == 0)  /* Class 0 is consumed as one byte, exactly like class 1.  */
+    rval = 1;
+  if (rval >= 2)
+    {
+      if (*inbytesleftp < rval)  /* Sequence would run past the end of input.  */
+	return EINVAL;
+      for (int i = 1; i < rval; ++i)  /* Each remaining byte must be class 9.  */
+	if (utf_ebcdic_map[(*inbufp)[i]] != 9)
+	  return EILSEQ;
+    }
+  *inbytesleftp -= rval;
+  *inbufp += rval;
+#endif
+
+  **outbufp = ' ';  /* Placeholder byte; only the count of bytes written is used.  */
+
+  *outbufp += 1;
+  *outbytesleftp -= 1;
+  return 0;
+}
+
+
 /* Helper routine for the next few functions.  The 'const' on
    one_conversion means that we promise not to modify what function is
    pointed to, which lets the inliner see through it.  */
@@ -529,6 +596,15 @@ convert_utf32_utf8 (iconv_t cd, const uc
   return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
 }
 
+/* Pseudo-conversion: drives one_count_chars through conversion_loop over
+   FLEN bytes at FROM, so afterwards only to->len is significant.  */
+static bool
+convert_count_chars (iconv_t cd, const uchar *from,
+		     size_t flen, struct _cpp_strbuf *to)
+{
+  return conversion_loop (one_count_chars, cd, from, flen, to);
+}
+
 /* Identity conversion, used when we have no alternative.  */
 static bool
 convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
@@ -2623,15 +2699,22 @@ narrow_str_to_charconst (cpp_reader *pfi
 	 ill-formed.  We need to count the number of c-chars and compare
 	 that to str.len.  */
       cpp_string str2 = { 0, 0 };
-      if (cpp_interpret_string (pfile, &token->val.str, 1, &str2,
-				CPP_STRING32))
+      bool (*saved_diagnostic_handler) (cpp_reader *, enum cpp_diagnostic_level,
+					enum cpp_warning_reason, rich_location *,
+					const char *, va_list *)
+	ATTRIBUTE_FPTR_PRINTF(5,0);
+      saved_diagnostic_handler = pfile->cb.diagnostic;
+      pfile->cb.diagnostic = noop_diagnostic_cb;
+      convert_f save_func = pfile->narrow_cset_desc.func;
+      pfile->narrow_cset_desc.func = convert_count_chars;
+      bool ret = cpp_interpret_string (pfile, &token->val.str, 1, &str2, type);
+      pfile->narrow_cset_desc.func = save_func;
+      pfile->cb.diagnostic = saved_diagnostic_handler;
+      if (ret)
 	{
-	  size_t width32 = converter_for_type (pfile, CPP_STRING32).width;
-	  size_t nbwc = width32 / width;
-	  size_t len = str2.len / nbwc;
 	  if (str2.text != token->val.str.text)
 	    free ((void *)str2.text);
-	  if (str.len > len)
+	  if (str.len > str2.len)
 	    {
 	      diagnosed
 		= cpp_error (pfile, CPP_DL_PEDWARN,


	Jakub


^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2023-08-28 14:00 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-26 11:11 [PATCH] libcpp: Small incremental patch for P1854R4 [PR110341] Jakub Jelinek
2023-08-28 14:00 ` [PATCH] libcpp, v2: " Jakub Jelinek

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).