public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] iconv: Fix incorrect UCS4 inner loop bounds (BZ#26923)
@ 2020-11-19 18:40 Michael Colavita
  2020-11-24 13:58 ` Andreas Schwab
  0 siblings, 1 reply; 6+ messages in thread
From: Michael Colavita @ 2020-11-19 18:40 UTC (permalink / raw)
  To: libc-alpha; +Cc: Pádraig Brady, Neal Poole

[-- Attachment #1: Type: text/plain, Size: 601 bytes --]

Previously, in UCS4 conversion routines we limit the number of
characters we examine to the minimum of the number of characters in the
input and the number of characters in the output. This is not the
correct behavior when __GCONV_IGNORE_ERRORS is set, as we do not consume
an output character when we skip a code unit. Instead, track the input
and output pointers and terminate the loop when either reaches its
limit.

This resolves assertion failures when resetting the input buffer in a step of
iconv, which assumes that the input will be fully consumed given sufficient
output space.


[-- Attachment #2: 0001-iconv-Fix-incorrect-UCS4-inner-loop-bounds-BZ-26923.patch --]
[-- Type: application/octet-stream, Size: 5315 bytes --]

From a61ad9719d1832487ff3c11dd0507f662b1adec2 Mon Sep 17 00:00:00 2001
From: Michael Colavita <mcolavita@fb.com>
Date: Thu, 19 Nov 2020 11:44:40 -0500
Subject: [PATCH] iconv: Fix incorrect UCS4 inner loop bounds (BZ#26923)
To: libc-alpha@sourceware.org

Previously, in UCS4 conversion routines we limit the number of
characters we examine to the minimum of the number of characters in the
input and the number of characters in the output. This is not the
correct behavior when __GCONV_IGNORE_ERRORS is set, as we do not consume
an output character when we skip a code unit. Instead, track the input
and output pointers and terminate the loop when either reaches its
limit.

This resolves assertion failures when resetting the input buffer in a step of
iconv, which assumes that the input will be fully consumed given sufficient
output space.
---
 iconv/Makefile       |  2 +-
 iconv/gconv_simple.c | 16 ++++----------
 iconv/tst-iconv8.c   | 50 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 13 deletions(-)
 create mode 100644 iconv/tst-iconv8.c

diff --git a/iconv/Makefile b/iconv/Makefile
index 30bf996d3a..f9b51e23ec 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -44,7 +44,7 @@ CFLAGS-linereader.c += -DNO_TRANSLITERATION
 CFLAGS-simple-hash.c += -I../locale
 
 tests	= tst-iconv1 tst-iconv2 tst-iconv3 tst-iconv4 tst-iconv5 tst-iconv6 \
-	  tst-iconv7 tst-iconv-mt tst-iconv-opt
+	  tst-iconv7 tst-iconv8 tst-iconv-mt tst-iconv-opt
 
 others		= iconv_prog iconvconfig
 install-others-programs	= $(inst_bindir)/iconv
diff --git a/iconv/gconv_simple.c b/iconv/gconv_simple.c
index d4797fba17..963b29f246 100644
--- a/iconv/gconv_simple.c
+++ b/iconv/gconv_simple.c
@@ -239,11 +239,9 @@ ucs4_internal_loop (struct __gconv_step *step,
   int flags = step_data->__flags;
   const unsigned char *inptr = *inptrp;
   unsigned char *outptr = *outptrp;
-  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
   int result;
-  size_t cnt;
 
-  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
+  for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
     {
       uint32_t inval;
 
@@ -307,11 +305,9 @@ ucs4_internal_loop_unaligned (struct __gconv_step *step,
   int flags = step_data->__flags;
   const unsigned char *inptr = *inptrp;
   unsigned char *outptr = *outptrp;
-  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
   int result;
-  size_t cnt;
 
-  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
+  for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
     {
       if (__glibc_unlikely (inptr[0] > 0x80))
 	{
@@ -613,11 +609,9 @@ ucs4le_internal_loop (struct __gconv_step *step,
   int flags = step_data->__flags;
   const unsigned char *inptr = *inptrp;
   unsigned char *outptr = *outptrp;
-  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
   int result;
-  size_t cnt;
 
-  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
+  for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
     {
       uint32_t inval;
 
@@ -684,11 +678,9 @@ ucs4le_internal_loop_unaligned (struct __gconv_step *step,
   int flags = step_data->__flags;
   const unsigned char *inptr = *inptrp;
   unsigned char *outptr = *outptrp;
-  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
   int result;
-  size_t cnt;
 
-  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
+  for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
     {
       if (__glibc_unlikely (inptr[3] > 0x80))
 	{
diff --git a/iconv/tst-iconv8.c b/iconv/tst-iconv8.c
new file mode 100644
index 0000000000..0b92b19f66
--- /dev/null
+++ b/iconv/tst-iconv8.c
@@ -0,0 +1,50 @@
+/* Test iconv behavior on UCS4 conversions with //IGNORE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Derived from BZ #26923 */
+#include <errno.h>
+#include <iconv.h>
+#include <stdio.h>
+#include <support/check.h>
+
+static int
+do_test (void)
+{
+  iconv_t cd = iconv_open ("UTF-8//IGNORE", "ISO-10646/UCS4/");
+  TEST_VERIFY_EXIT (cd != (iconv_t) -1);
+
+  /*
+   * Convert sequence beginning with an irreversible character into buffer that
+   * is too small.
+   */
+  char input[12] = "\xe1\x80\xa1" "AAAAAAAAA";
+  char *inptr = input;
+  size_t insize = sizeof (input);
+  char output[6];
+  char *outptr = output;
+  size_t outsize = sizeof (output);
+
+  TEST_VERIFY (iconv (cd, &inptr, &insize, &outptr, &outsize) == -1);
+  TEST_VERIFY (errno == E2BIG);
+
+  TEST_VERIFY_EXIT (iconv_close (cd) != -1);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
-- 
2.24.3 (Apple Git-128)


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] iconv: Fix incorrect UCS4 inner loop bounds (BZ#26923)
  2020-11-19 18:40 [PATCH] iconv: Fix incorrect UCS4 inner loop bounds (BZ#26923) Michael Colavita
@ 2020-11-24 13:58 ` Andreas Schwab
  2020-11-24 19:15   ` Michael Colavita
  0 siblings, 1 reply; 6+ messages in thread
From: Andreas Schwab @ 2020-11-24 13:58 UTC (permalink / raw)
  To: Michael Colavita via Libc-alpha
  Cc: Michael Colavita, Neal Poole, Pádraig Brady

On Nov 19 2020, Michael Colavita via Libc-alpha wrote:

> diff --git a/iconv/gconv_simple.c b/iconv/gconv_simple.c
> index d4797fba17..963b29f246 100644
> --- a/iconv/gconv_simple.c
> +++ b/iconv/gconv_simple.c
> @@ -239,11 +239,9 @@ ucs4_internal_loop (struct __gconv_step *step,
>    int flags = step_data->__flags;
>    const unsigned char *inptr = *inptrp;
>    unsigned char *outptr = *outptrp;
> -  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
>    int result;
> -  size_t cnt;
>  
> -  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
> +  for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)

This may exit the loop prematurely when the input ends in irreversible
characters.

Andreas.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510  2552 DF73 E780 A9DA AEC1
"And now for something completely different."

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] iconv: Fix incorrect UCS4 inner loop bounds (BZ#26923)
  2020-11-24 13:58 ` Andreas Schwab
@ 2020-11-24 19:15   ` Michael Colavita
  2020-12-02 16:43     ` Michael Colavita
  2020-12-04  4:03     ` Siddhesh Poyarekar
  0 siblings, 2 replies; 6+ messages in thread
From: Michael Colavita @ 2020-11-24 19:15 UTC (permalink / raw)
  To: Andreas Schwab
  Cc: Michael Colavita via Libc-alpha, Neal Poole, Pádraig Brady

> This may exit the loop prematurely when the input ends in irreversible
> characters.

To correct my explanation from before, resetting the input buffer assumes that
the output will be fully consumed given sufficient input. I don't believe it
affects correctness whether irreversible characters at the end are consumed or
not. If they are not skipped, they will be skipped in the next round of the
loop. The position to which we will have reset the input cursor is equivalent.

Currently the semantics of translation functions appear to be that reaching the
end of the output buffer terminates translation, even if there are further
irreversible characters that can be skipped. This is consistent across the
various routines.

However, if you think this behavior is undesirable, I can correct it across the
routines. If we pursue this route, I'm curious about the case when we are *not*
ignoring irreversible characters. If our output buffer is full and the next
input character is irreversible, would we continue to return
__GCONV_FULL_OUTPUT or instead return __GCONV_ILLEGAL_INPUT?

Let me know if you'd like me to pursue this and I can update the patch
accordingly (or submit the change separately if you agree that it does not
affect correctness).

Michael

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] iconv: Fix incorrect UCS4 inner loop bounds (BZ#26923)
  2020-11-24 19:15   ` Michael Colavita
@ 2020-12-02 16:43     ` Michael Colavita
  2020-12-04  4:03     ` Siddhesh Poyarekar
  1 sibling, 0 replies; 6+ messages in thread
From: Michael Colavita @ 2020-12-02 16:43 UTC (permalink / raw)
  To: Michael Colavita
  Cc: Andreas Schwab, Pádraig Brady, Neal Poole,
	Michael Colavita via Libc-alpha

Quick ping on this patch.

Michael

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] iconv: Fix incorrect UCS4 inner loop bounds (BZ#26923)
  2020-11-24 19:15   ` Michael Colavita
  2020-12-02 16:43     ` Michael Colavita
@ 2020-12-04  4:03     ` Siddhesh Poyarekar
  1 sibling, 0 replies; 6+ messages in thread
From: Siddhesh Poyarekar @ 2020-12-04  4:03 UTC (permalink / raw)
  To: Michael Colavita, Andreas Schwab
  Cc: Pádraig Brady, Neal Poole, Michael Colavita via Libc-alpha

On 11/25/20 12:45 AM, Michael Colavita via Libc-alpha wrote:
>> This may exit the loop prematurely when the input ends in irreversible
>> characters.
> 
> To correct my explanation from before, resetting the input buffer assumes that
> the output will be fully consumed given sufficient input. I don't believe it
> affects correctness whether irreversible characters at the end are consumed or
> not. If they are not skipped, they will be skipped in the next round of the
> loop. The position to which we will have reset the input cursor is equivalent.
> 
> Currently the semantics of translation functions appear to be that reaching the
> end of the output buffer terminates translation, even if there are further
> irreversible characters that can be skipped. This is consistent across the
> various routines.
> 
> However, if you think this behavior is undesirable, I can correct it across the
> routines. If we pursue this route, I'm curious about the case when we are *not*
> ignoring irreversible characters. If our output buffer is full and the next
> input character is irreversible, would we continue to return
> __GCONV_FULL_OUTPUT or instead return __GCONV_ILLEGAL_INPUT?
> 
> Let me know if you'd like me to pursue this and I can update the patch
> accordingly (or submit the change separately if you agree that it does not
> affect correctness).

This looks OK to me.  I'll commit the patch Nathan posted next week if 
Andreas does not have any further comments.

Thanks,
Siddhesh

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH] iconv: Fix incorrect UCS4 inner loop bounds (BZ#26923)
@ 2020-11-20 20:24 Nathan Sidwell
  0 siblings, 0 replies; 6+ messages in thread
From: Nathan Sidwell @ 2020-11-20 20:24 UTC (permalink / raw)
  To: libc-alpha

[-- Attachment #1: Type: text/plain, Size: 764 bytes --]

This is a reposting of 
https://sourceware.org/pipermail/libc-alpha/2020-November/119822.html 
blessing it with the FB disclaimerness that I have.  IANAL.

Previously, in UCS4 conversion routines we limit the number of
characters we examine to the minimum of the number of characters in the
input and the number of characters in the output. This is not the
correct behavior when __GCONV_IGNORE_ERRORS is set, as we do not consume
an output character when we skip a code unit. Instead, track the input
and output pointers and terminate the loop when either reaches its
limit.

This resolves assertion failures when resetting the input buffer in a 
step of
iconv, which assumes that the input will be fully consumed given sufficient
output space.
-- 
Nathan Sidwell

[-- Attachment #2: colavita.diff --]
[-- Type: text/x-patch, Size: 4205 bytes --]

diff --git a/iconv/Makefile b/iconv/Makefile
index 30bf996d3a..f9b51e23ec 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -44,7 +44,7 @@ CFLAGS-linereader.c += -DNO_TRANSLITERATION
 CFLAGS-simple-hash.c += -I../locale
 
 tests	= tst-iconv1 tst-iconv2 tst-iconv3 tst-iconv4 tst-iconv5 tst-iconv6 \
-	  tst-iconv7 tst-iconv-mt tst-iconv-opt
+	  tst-iconv7 tst-iconv8 tst-iconv-mt tst-iconv-opt
 
 others		= iconv_prog iconvconfig
 install-others-programs	= $(inst_bindir)/iconv
diff --git a/iconv/gconv_simple.c b/iconv/gconv_simple.c
index d4797fba17..963b29f246 100644
--- a/iconv/gconv_simple.c
+++ b/iconv/gconv_simple.c
@@ -239,11 +239,9 @@ ucs4_internal_loop (struct __gconv_step *step,
   int flags = step_data->__flags;
   const unsigned char *inptr = *inptrp;
   unsigned char *outptr = *outptrp;
-  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
   int result;
-  size_t cnt;
 
-  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
+  for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
     {
       uint32_t inval;
 
@@ -307,11 +305,9 @@ ucs4_internal_loop_unaligned (struct __gconv_step *step,
   int flags = step_data->__flags;
   const unsigned char *inptr = *inptrp;
   unsigned char *outptr = *outptrp;
-  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
   int result;
-  size_t cnt;
 
-  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
+  for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
     {
       if (__glibc_unlikely (inptr[0] > 0x80))
 	{
@@ -613,11 +609,9 @@ ucs4le_internal_loop (struct __gconv_step *step,
   int flags = step_data->__flags;
   const unsigned char *inptr = *inptrp;
   unsigned char *outptr = *outptrp;
-  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
   int result;
-  size_t cnt;
 
-  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
+  for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
     {
       uint32_t inval;
 
@@ -684,11 +678,9 @@ ucs4le_internal_loop_unaligned (struct __gconv_step *step,
   int flags = step_data->__flags;
   const unsigned char *inptr = *inptrp;
   unsigned char *outptr = *outptrp;
-  size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
   int result;
-  size_t cnt;
 
-  for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4)
+  for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
     {
       if (__glibc_unlikely (inptr[3] > 0x80))
 	{
diff --git a/iconv/tst-iconv8.c b/iconv/tst-iconv8.c
new file mode 100644
index 0000000000..0b92b19f66
--- /dev/null
+++ b/iconv/tst-iconv8.c
@@ -0,0 +1,50 @@
+/* Test iconv behavior on UCS4 conversions with //IGNORE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Derived from BZ #26923 */
+#include <errno.h>
+#include <iconv.h>
+#include <stdio.h>
+#include <support/check.h>
+
+static int
+do_test (void)
+{
+  iconv_t cd = iconv_open ("UTF-8//IGNORE", "ISO-10646/UCS4/");
+  TEST_VERIFY_EXIT (cd != (iconv_t) -1);
+
+  /*
+   * Convert sequence beginning with an irreversible character into buffer that
+   * is too small.
+   */
+  char input[12] = "\xe1\x80\xa1" "AAAAAAAAA";
+  char *inptr = input;
+  size_t insize = sizeof (input);
+  char output[6];
+  char *outptr = output;
+  size_t outsize = sizeof (output);
+
+  TEST_VERIFY (iconv (cd, &inptr, &insize, &outptr, &outsize) == -1);
+  TEST_VERIFY (errno == E2BIG);
+
+  TEST_VERIFY_EXIT (iconv_close (cd) != -1);
+
+  return 0;
+}
+
+#include <support/test-driver.c>

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-12-04  4:04 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-19 18:40 [PATCH] iconv: Fix incorrect UCS4 inner loop bounds (BZ#26923) Michael Colavita
2020-11-24 13:58 ` Andreas Schwab
2020-11-24 19:15   ` Michael Colavita
2020-12-02 16:43     ` Michael Colavita
2020-12-04  4:03     ` Siddhesh Poyarekar
2020-11-20 20:24 Nathan Sidwell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).