* [patch, libgfortran] Wrong result for UTF-8/UCS-4 list-directed and namelist read and nml write
@ 2014-03-30 11:05 Jerry DeLisle
2014-04-13 17:48 ` Jerry DeLisle
` (2 more replies)
0 siblings, 3 replies; 6+ messages in thread
From: Jerry DeLisle @ 2014-03-30 11:05 UTC (permalink / raw)
To: gfortran; +Cc: gcc patches
[-- Attachment #1: Type: text/plain, Size: 858 bytes --]
Hi all,
The attached patch fixes namelist read/write and list directed read/write to
support UTF-8.
I have attached a preliminary test case to use to experiment with this. I will
need to set it up for the testsuite still.
Regression tested on x86-64-linux-gnu.
OK for trunk or wait?
Regards,
Jerry
2014-03-29 Jerry DeLisle <jvdelisle@gcc.gnu>
PR libfortran/52539
* io/list_read.c: Add uchar typedef. (push_char4): New function
to save kind=4 character. (next_char_utf8): New function to read
a single UTF-8 encoded character value. (read_chracter): Update
to use the new functions for reading UTF-8 strings.
(list_formatted_read_scalar): Update to handle list directed
reads of UTF-8 strings. (nml_read_obj): Likewise update for
UTF-8 strings in namelists.
* io/write.c (nml_write_obj): Add kind=4 character support for
namelist writes.
[-- Attachment #2: pr52539.diff --]
[-- Type: text/x-patch, Size: 7773 bytes --]
Index: list_read.c
===================================================================
--- list_read.c (revision 208931)
+++ list_read.c (working copy)
@@ -32,7 +32,9 @@ see the files COPYING3 and COPYING.RUNTIME respect
#include <stdlib.h>
#include <ctype.h>
+typedef unsigned char uchar;
+
/* List directed input. Several parsing subroutines are practically
reimplemented from formatted input, the reason being that there are
all kinds of small differences between formatted and list directed
@@ -97,7 +99,38 @@ push_char (st_parameter_dt *dtp, char c)
dtp->u.p.saved_string[dtp->u.p.saved_used++] = c;
}
+/* Save a KIND=4 character to a string buffer, enlarging the buffer
+ as necessary. */
+static void
+push_char4 (st_parameter_dt *dtp, gfc_char4_t c)
+{
+ gfc_char4_t *new, *p = (gfc_char4_t *) dtp->u.p.saved_string;
+
+ if (p == NULL)
+ {
+ dtp->u.p.saved_string = xcalloc (SCRATCH_SIZE, sizeof (gfc_char4_t));
+ dtp->u.p.saved_length = SCRATCH_SIZE;
+ dtp->u.p.saved_used = 0;
+ p = (gfc_char4_t *) dtp->u.p.saved_string;
+ }
+
+ if (dtp->u.p.saved_used >= dtp->u.p.saved_length)
+ {
+ dtp->u.p.saved_length = 2 * dtp->u.p.saved_length;
+ new = realloc (p, dtp->u.p.saved_length);
+ if (new == NULL)
+ generate_error (&dtp->common, LIBERROR_OS, NULL);
+ p = new;
+
+ memset (new + dtp->u.p.saved_used, 0,
+ dtp->u.p.saved_length - dtp->u.p.saved_used);
+ }
+
+ p[dtp->u.p.saved_used++] = c;
+}
+
+
/* Free the input buffer if necessary. */
static void
@@ -247,6 +280,57 @@ done:
}
+static gfc_char4_t
+next_char_utf8 (st_parameter_dt *dtp)
+{
+ static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
+ static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+ int i, nb;
+ gfc_char4_t c;
+
+ c = next_char (dtp);
+ if (c < 0x80)
+ return c;
+
+ /* The number of leading 1-bits in the first byte indicates how many
+ bytes follow. */
+ for (nb = 2; nb < 7; nb++)
+ if ((c & ~masks[nb-1]) == patns[nb-1])
+ goto found;
+ goto invalid;
+
+ found:
+ c = (c & masks[nb-1]);
+
+ /* Decode the bytes read. */
+ for (i = 1; i < nb; i++)
+ {
+ gfc_char4_t n = next_char (dtp);
+
+ if ((n & 0xC0) != 0x80)
+ goto invalid;
+
+ c = ((c << 6) + (n & 0x3F));
+ }
+
+ /* Make sure the shortest possible encoding was used. */
+ if (c <= 0x7F && nb > 1) goto invalid;
+ if (c <= 0x7FF && nb > 2) goto invalid;
+ if (c <= 0xFFFF && nb > 3) goto invalid;
+ if (c <= 0x1FFFFF && nb > 4) goto invalid;
+ if (c <= 0x3FFFFFF && nb > 5) goto invalid;
+
+ /* Make sure the character is valid. */
+ if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF))
+ goto invalid;
+
+ return c;
+
+ invalid:
+ generate_error (&dtp->common, LIBERROR_READ_VALUE, "Invalid UTF-8 encoding");
+ return (gfc_char4_t) '?';
+}
+
/* Push a character back onto the input. */
static void
@@ -1087,51 +1171,98 @@ read_character (st_parameter_dt *dtp, int length _
}
get_string:
- for (;;)
- {
- if ((c = next_char (dtp)) == EOF)
- goto done_eof;
- switch (c)
- {
- case '"':
- case '\'':
- if (c != quote)
- {
+
+ if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+ for (;;)
+ {
+ if ((c = next_char_utf8 (dtp)) == EOF)
+ goto done_eof;
+ switch (c)
+ {
+ case '"':
+ case '\'':
+ if (c != quote)
+ {
+ push_char4 (dtp, c);
+ break;
+ }
+
+ /* See if we have a doubled quote character or the end of
+ the string. */
+
+ if ((c = next_char_utf8 (dtp)) == EOF)
+ goto done_eof;
+ if (c == quote)
+ {
+ push_char4 (dtp, quote);
+ break;
+ }
+
+ unget_char (dtp, c);
+ goto done;
+
+ CASE_SEPARATORS:
+ if (quote == ' ')
+ {
+ unget_char (dtp, c);
+ goto done;
+ }
+
+ if (c != '\n' && c != '\r')
+ push_char4 (dtp, c);
+ break;
+
+ default:
+ push_char4 (dtp, c);
+ break;
+ }
+ }
+ else
+ for (;;)
+ {
+ if ((c = next_char (dtp)) == EOF)
+ goto done_eof;
+ switch (c)
+ {
+ case '"':
+ case '\'':
+ if (c != quote)
+ {
+ push_char (dtp, c);
+ break;
+ }
+
+ /* See if we have a doubled quote character or the end of
+ the string. */
+
+ if ((c = next_char (dtp)) == EOF)
+ goto done_eof;
+ if (c == quote)
+ {
+ push_char (dtp, quote);
+ break;
+ }
+
+ unget_char (dtp, c);
+ goto done;
+
+ CASE_SEPARATORS:
+ if (quote == ' ')
+ {
+ unget_char (dtp, c);
+ goto done;
+ }
+
+ if (c != '\n' && c != '\r')
push_char (dtp, c);
- break;
- }
-
- /* See if we have a doubled quote character or the end of
- the string. */
-
- if ((c = next_char (dtp)) == EOF)
- goto done_eof;
- if (c == quote)
- {
- push_char (dtp, quote);
- break;
- }
-
- unget_char (dtp, c);
- goto done;
-
- CASE_SEPARATORS:
- if (quote == ' ')
- {
- unget_char (dtp, c);
- goto done;
- }
-
- if (c != '\n' && c != '\r')
+ break;
+
+ default:
push_char (dtp, c);
- break;
+ break;
+ }
+ }
- default:
- push_char (dtp, c);
- break;
- }
- }
-
/* At this point, we have to have a separator, or else the string is
invalid. */
done:
@@ -1903,7 +2034,7 @@ static int
list_formatted_read_scalar (st_parameter_dt *dtp, bt type, void *p,
int kind, size_t size)
{
- gfc_char4_t *q;
+ gfc_char4_t *q, *r;
int c, i, m;
int err = 0;
@@ -2031,13 +2162,19 @@ list_formatted_read_scalar (st_parameter_dt *dtp,
{
m = ((int) size < dtp->u.p.saved_used)
? (int) size : dtp->u.p.saved_used;
- if (kind == 1)
- memcpy (p, dtp->u.p.saved_string, m);
+
+ q = (gfc_char4_t *) p;
+ r = (gfc_char4_t *) dtp->u.p.saved_string;
+ if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+ for (i = 0; i < m; i++)
+ *q++ = *r++;
else
{
- q = (gfc_char4_t *) p;
- for (i = 0; i < m; i++)
- q[i] = (unsigned char) dtp->u.p.saved_string[i];
+ if (kind == 1)
+ memcpy (p, dtp->u.p.saved_string, m);
+ else
+ for (i = 0; i < m; i++)
+ *q++ = (unsigned char) dtp->u.p.saved_string[i];
}
}
else
@@ -2771,10 +2908,27 @@ nml_read_obj (st_parameter_dt *dtp, namelist_info
}
else
m = dtp->u.p.saved_used;
- pdata = (void*)( pdata + clow - 1 );
- memcpy (pdata, dtp->u.p.saved_string, m);
- if (m < dlen)
- memset ((void*)( pdata + m ), ' ', dlen - m);
+
+ if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+ {
+ gfc_char4_t *q4, *p4 = pdata;
+ int i;
+
+ q4 = (gfc_char4_t *) dtp->u.p.saved_string;
+ p4 += clow -1;
+ for (i = 0; i < m; i++)
+ *p4++ = *q4++;
+ if (m < dlen)
+ for (i = 0; i < dlen - m; i++)
+ *p4++ = (gfc_char4_t) ' ';
+ }
+ else
+ {
+ pdata = (void*)( pdata + clow - 1 );
+ memcpy (pdata, dtp->u.p.saved_string, m);
+ if (m < dlen)
+ memset ((void*)( pdata + m ), ' ', dlen - m);
+ }
break;
default:
Index: write.c
===================================================================
--- write.c (revision 208931)
+++ write.c (working copy)
@@ -1835,7 +1835,10 @@ nml_write_obj (st_parameter_dt *dtp, namelist_info
break;
case BT_CHARACTER:
- write_character (dtp, p, 1, obj->string_length, DELIM);
+ if (dtp->u.p.current_unit->flags.encoding == ENCODING_UTF8)
+ write_character (dtp, p, 4, obj->string_length, DELIM);
+ else
+ write_character (dtp, p, 1, obj->string_length, DELIM);
break;
case BT_REAL:
[-- Attachment #3: pr52539.f90 --]
[-- Type: text/x-fortran, Size: 1372 bytes --]
character(len=10, kind=4) :: str, str2
character(len=25, kind=4) :: str3
namelist /nml/ str
str = 4_'a'//char (int (z'4F60'),4) &
//char (int (z'597D'), 4)//4_'b'
print *, "Write to terminal just the strings:"
open(6, encoding='utf-8')
write(*, '(1x,a)') 4_'>'//str//4_'<'
write(*, *) 4_'>'//str//4_'<'
print *, "Write to the terminal the namelist:"
write(6,nml=nml)
print *, "Write to Unit 99 the namelist encoded:"
open(99, encoding='utf-8',form='formatted')
write(99, '(3a)') '&nml str = "', str, '" /'
print *, "Write to Unit 99 the encoded string:"
write(99, '(a)') str
print *, " Done!"
rewind(99)
str = 4_'XXXX'
str2 = 4_'YYYY'
read(99,nml=nml)
print *, "Write the value returned by the namelist read:"
write(*, '(a)') 4_'>'//str//4_'<'
read(99, *) str2
print *, "Write the value returned by the list read:"
write(*, *) 4_'>'//str2//4_'<'
print *, "Write the namlist to the terminal:"
write(*,nml=nml)
print *, "Write the string to the terminal, list directed:"
write(*, *) 4_'>'//str2//4_'<'
print *, "Write the string to the terminal, formatted:"
write(*, '(1x,a)') 4_'>'//str2//4_'<'
print *, "Rewind the file and just read one line at a time"
print *, "and write the results to the terminal, list directed:"
rewind(99)
read(99,*) str3
write(*,*) "line1:",str3
read(99,*) str3
write(*,*) "line2:",str3
close(99, status='delete')
end
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [patch, libgfortran] Wrong result for UTF-8/UCS-4 list-directed and namelist read and nml write
2014-03-30 11:05 [patch, libgfortran] Wrong result for UTF-8/UCS-4 list-directed and namelist read and nml write Jerry DeLisle
@ 2014-04-13 17:48 ` Jerry DeLisle
2014-04-24 3:47 ` Jerry DeLisle
2014-04-24 6:22 ` Tobias Burnus
2014-04-27 12:01 ` Andreas Schwab
2 siblings, 1 reply; 6+ messages in thread
From: Jerry DeLisle @ 2014-04-13 17:48 UTC (permalink / raw)
To: gfortran; +Cc: gcc patches
Ping! This is PR52539
On 03/29/2014 09:58 PM, Jerry DeLisle wrote:
> Hi all,
>
> The attached patch fixes namelist read/write and list directed read/write to
> support UTF-8.
>
> I have attached a preliminary test case to use to experiment with this. I will
> need to set it up for the testsuite still.
>
> Regression tested on x86-64-linux-gnu.
>
> OK for trunk or wait?
>
> Regards,
>
> Jerry
>
> 2014-03-29 Jerry DeLisle <jvdelisle@gcc.gnu>
>
> PR libfortran/52539
> * io/list_read.c: Add uchar typedef. (push_char4): New function
> to save kind=4 character. (next_char_utf8): New function to read
> a single UTF-8 encoded character value. (read_chracter): Update
> to use the new functions for reading UTF-8 strings.
> (list_formatted_read_scalar): Update to handle list directed
> reads of UTF-8 strings. (nml_read_obj): Likewise update for
> UTF-8 strings in namelists.
> * io/write.c (nml_write_obj): Add kind=4 character support for
> namelist writes.
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [patch, libgfortran] Wrong result for UTF-8/UCS-4 list-directed and namelist read and nml write
2014-04-13 17:48 ` Jerry DeLisle
@ 2014-04-24 3:47 ` Jerry DeLisle
0 siblings, 0 replies; 6+ messages in thread
From: Jerry DeLisle @ 2014-04-24 3:47 UTC (permalink / raw)
To: gfortran; +Cc: gcc patches
On 04/13/2014 10:48 AM, Jerry DeLisle wrote:
> Ping! This is PR52539
>
Without comment or objections, I will commit this patch this weekend to trunk.
Regards,
Jerry
> On 03/29/2014 09:58 PM, Jerry DeLisle wrote:
>> Hi all,
>>
>> The attached patch fixes namelist read/write and list directed read/write to
>> support UTF-8.
>>
>> I have attached a preliminary test case to use to experiment with this. I will
>> need to set it up for the testsuite still.
>>
>> Regression tested on x86-64-linux-gnu.
>>
>> OK for trunk or wait?
>>
>> Regards,
>>
>> Jerry
>>
>> 2014-03-29 Jerry DeLisle <jvdelisle@gcc.gnu>
>>
>> PR libfortran/52539
>> * io/list_read.c: Add uchar typedef. (push_char4): New function
>> to save kind=4 character. (next_char_utf8): New function to read
>> a single UTF-8 encoded character value. (read_chracter): Update
>> to use the new functions for reading UTF-8 strings.
>> (list_formatted_read_scalar): Update to handle list directed
>> reads of UTF-8 strings. (nml_read_obj): Likewise update for
>> UTF-8 strings in namelists.
>> * io/write.c (nml_write_obj): Add kind=4 character support for
>> namelist writes.
>>
>
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [patch, libgfortran] Wrong result for UTF-8/UCS-4 list-directed and namelist read and nml write
2014-03-30 11:05 [patch, libgfortran] Wrong result for UTF-8/UCS-4 list-directed and namelist read and nml write Jerry DeLisle
2014-04-13 17:48 ` Jerry DeLisle
@ 2014-04-24 6:22 ` Tobias Burnus
2014-04-27 12:01 ` Andreas Schwab
2 siblings, 0 replies; 6+ messages in thread
From: Tobias Burnus @ 2014-04-24 6:22 UTC (permalink / raw)
To: Jerry DeLisle, gfortran; +Cc: gcc patches
Jerry DeLisle wrote:
> The attached patch fixes namelist read/write and list directed read/write to
> support UTF-8. I have attached a preliminary test case to use to experiment with this. I will
> need to set it up for the testsuite still.
> Regression tested on x86-64-linux-gnu.
> OK for trunk or wait?
Yes, now that GCC 4.10 has started: The patch is okay for the trunk.
However, you should polish the test case to check the result and abort()
when the result gives something unexpected.
Tobias
> 2014-03-29 Jerry DeLisle <jvdelisle@gcc.gnu>
>
> PR libfortran/52539
> * io/list_read.c: Add uchar typedef. (push_char4): New function
> to save kind=4 character. (next_char_utf8): New function to read
> a single UTF-8 encoded character value. (read_chracter): Update
> to use the new functions for reading UTF-8 strings.
> (list_formatted_read_scalar): Update to handle list directed
> reads of UTF-8 strings. (nml_read_obj): Likewise update for
> UTF-8 strings in namelists.
> * io/write.c (nml_write_obj): Add kind=4 character support for
> namelist writes.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [patch, libgfortran] Wrong result for UTF-8/UCS-4 list-directed and namelist read and nml write
2014-03-30 11:05 [patch, libgfortran] Wrong result for UTF-8/UCS-4 list-directed and namelist read and nml write Jerry DeLisle
2014-04-13 17:48 ` Jerry DeLisle
2014-04-24 6:22 ` Tobias Burnus
@ 2014-04-27 12:01 ` Andreas Schwab
2014-04-27 15:17 ` Jerry DeLisle
2 siblings, 1 reply; 6+ messages in thread
From: Andreas Schwab @ 2014-04-27 12:01 UTC (permalink / raw)
To: Jerry DeLisle; +Cc: gfortran, gcc patches
Jerry DeLisle <jvdelisle@charter.net> writes:
> +static void
> +push_char4 (st_parameter_dt *dtp, gfc_char4_t c)
> +{
> + gfc_char4_t *new, *p = (gfc_char4_t *) dtp->u.p.saved_string;
> +
> + if (p == NULL)
> + {
> + dtp->u.p.saved_string = xcalloc (SCRATCH_SIZE, sizeof (gfc_char4_t));
> + dtp->u.p.saved_length = SCRATCH_SIZE;
> + dtp->u.p.saved_used = 0;
> + p = (gfc_char4_t *) dtp->u.p.saved_string;
> + }
> +
> + if (dtp->u.p.saved_used >= dtp->u.p.saved_length)
> + {
> + dtp->u.p.saved_length = 2 * dtp->u.p.saved_length;
> + new = realloc (p, dtp->u.p.saved_length);
That's a buffer overflow.
Andreas.
--
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 58CA 54C7 6D53 942B 1756 01D3 44D5 214B 8276 4ED5
"And now for something completely different."
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [patch, libgfortran] Wrong result for UTF-8/UCS-4 list-directed and namelist read and nml write
2014-04-27 12:01 ` Andreas Schwab
@ 2014-04-27 15:17 ` Jerry DeLisle
0 siblings, 0 replies; 6+ messages in thread
From: Jerry DeLisle @ 2014-04-27 15:17 UTC (permalink / raw)
To: Andreas Schwab; +Cc: gfortran, gcc patches
On 04/27/2014 04:57 AM, Andreas Schwab wrote:
> Jerry DeLisle <jvdelisle@charter.net> writes:
>
>> +static void
>> +push_char4 (st_parameter_dt *dtp, gfc_char4_t c)
>> +{
>> + gfc_char4_t *new, *p = (gfc_char4_t *) dtp->u.p.saved_string;
>> +
>> + if (p == NULL)
>> + {
>> + dtp->u.p.saved_string = xcalloc (SCRATCH_SIZE, sizeof (gfc_char4_t));
>> + dtp->u.p.saved_length = SCRATCH_SIZE;
>> + dtp->u.p.saved_used = 0;
>> + p = (gfc_char4_t *) dtp->u.p.saved_string;
>> + }
>> +
>> + if (dtp->u.p.saved_used >= dtp->u.p.saved_length)
>> + {
>> + dtp->u.p.saved_length = 2 * dtp->u.p.saved_length;
>> + new = realloc (p, dtp->u.p.saved_length);
>
> That's a buffer overflow.
>
Do you mean it should be?
new = realloc (p, dtp->u.p.saved_length * sizeof (gfc_char4_t));
jerry
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2014-04-27 14:43 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-03-30 11:05 [patch, libgfortran] Wrong result for UTF-8/UCS-4 list-directed and namelist read and nml write Jerry DeLisle
2014-04-13 17:48 ` Jerry DeLisle
2014-04-24 3:47 ` Jerry DeLisle
2014-04-24 6:22 ` Tobias Burnus
2014-04-27 12:01 ` Andreas Schwab
2014-04-27 15:17 ` Jerry DeLisle
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).