public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] POSIX locale covers every byte [BZ# 29511]
@ 2022-08-30 18:19 наб
  2022-09-06 14:06 ` [PATCH v2] " наб
  2022-09-06 14:19 ` [PATCH] " Florian Weimer
  0 siblings, 2 replies; 29+ messages in thread
From: наб @ 2022-08-30 18:19 UTC (permalink / raw)
  To: libc-alpha; +Cc: Florian Weimer

[-- Attachment #1: Type: text/plain, Size: 28907 bytes --]

This is a trivial patch, largely duplicating the extant ASCII code

There are two user-facing changes:
  * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
  * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b

Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively:
  (a) is 1-byte, stateless, and contains 256 characters
  (b) which collate in byte order
  (c) the first 128 characters are equivalent to ASCII (like previous)
cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
changes to the standard;
in short, this means that mbrtowc() must never fail and must return
  b if b <= 0x7F else ab+c for all bytes b
  where c is some constant >=0x80
    and a is a positive integer constant

By strategically picking c=<UDF00> we land at the tail-end of the
Unicode Low Surrogate Area at DC00-DFFF, described as
  > Isolated surrogate code points have no interpretation;
  > consequently, no character code charts or names lists
  > are provided for this range.
and match musl

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
Please observe that this patch is NOT correct for s390:
the s390 assembly implementations, marked // TODO,
are copied verbatim from the ASCII ones

I lack the hardware and expertise to write them,
but all others in that file are in assembly, too;
should I just copy the bare implementations there?
Please advise.

CCing Florian since he commented on my bugzilla bug :)

 iconv/gconv_builtin.h                 |   8 +
 iconv/gconv_int.h                     |   7 +
 iconv/gconv_simple.c                  |  75 +++++++
 iconv/tst-iconv_prog.sh               |  43 ++++
 inet/tst-idna_name_classify.c         |   6 +-
 locale/tst-C-locale.c                 |  69 ++++++
 localedata/locales/POSIX              | 143 +++++++++++-
 stdio-common/tst-printf-bz25691.c     |   2 +
 sysdeps/s390/multiarch/gconv_simple.c | 298 ++++++++++++++++++++++++++
 wcsmbs/wcsmbsload.c                   |  10 +-
 10 files changed, 652 insertions(+), 9 deletions(-)

diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h
index 68c2369b1f..cd1805b3ce 100644
--- a/iconv/gconv_builtin.h
+++ b/iconv/gconv_builtin.h
@@ -89,6 +89,14 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
 			__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
 
 
+BUILTIN_TRANSFORMATION ("POSIX//", "INTERNAL", 1, "=posix->INTERNAL",
+			__gconv_transform_posix_internal, __gconv_btwoc_posix,
+			1, 1, 4, 4)
+
+BUILTIN_TRANSFORMATION ("INTERNAL", "POSIX//", 1, "=INTERNAL->posix",
+			__gconv_transform_internal_posix, NULL, 4, 4, 1, 1)
+
+
 #if BYTE_ORDER == BIG_ENDIAN
 BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
 BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index 1c6745043e..45ab1edfad 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -281,6 +281,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2)
 
 __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
+__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
 __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
 __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
@@ -299,6 +301,11 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
    only ASCII characters.  */
 extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
 
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c);
+
 #endif
 
 __END_DECLS
diff --git a/iconv/gconv_simple.c b/iconv/gconv_simple.c
index 640068d9ba..4cd01854cd 100644
--- a/iconv/gconv_simple.c
+++ b/iconv/gconv_simple.c
@@ -53,6 +53,18 @@ __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c)
     return WEOF;
 }
 
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+wint_t
+__gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+{
+  if (c < 0x80)
+    return c;
+  else
+    return 0xdf00 + c;
+}
+
 
 /* Transform from the internal, UCS4-like format, to UCS4.  The
    difference between the internal ucs4 format and the real UCS4
@@ -868,6 +880,69 @@ ucs4le_internal_loop_single (struct __gconv_step *step,
 #include <iconv/skeleton.c>
 
 
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DF80, U+DFFF]}
+   to the internal (UCS4-like) format.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		1
+#define MIN_NEEDED_TO		4
+#define FROM_DIRECTION		1
+#define FROM_LOOP		posix_internal_loop
+#define TO_LOOP			posix_internal_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_posix_internal
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdf00 + *inptr++;			      \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;				      \
+    outptr += sizeof (uint32_t);					      \
+  }
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DF80, U+DFFF] => [0x80, 0xFF]}.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		4
+#define MIN_NEEDED_TO		1
+#define FROM_DIRECTION		1
+#define FROM_LOOP		internal_posix_loop
+#define TO_LOOP			internal_posix_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_internal_posix
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    uint32_t val = *((const uint32_t *) inptr);				      \
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdf80) || val > 0xdfff))      \
+      {									      \
+	UNICODE_TAG_HANDLER (val, 4);					      \
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
+      }									      \
+    else								      \
+      {									      \
+	if (__glibc_unlikely (val > 0x7f))				      \
+	  val -= 0xdf00;						      \
+	*outptr++ = val;						      \
+	inptr += sizeof (uint32_t);					      \
+      }									      \
+  }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
 /* Convert from the internal (UCS4-like) format to UTF-8.  */
 #define DEFINE_INIT		0
 #define DEFINE_FINI		0
diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
index b3d8bf5110..a24d8d2207 100644
--- a/iconv/tst-iconv_prog.sh
+++ b/iconv/tst-iconv_prog.sh
@@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do
   execute_test
   check_errtest_result
 done
+
+allbytes ()
+{
+  for (( i = 0; i <= 255; i++ )); do
+    printf '\'"$(printf "%o" "$i")"
+  done
+}
+
+allucs4be ()
+{
+  for (( i = 0; i <= 127; i++ )); do
+    printf '\0\0\0\'"$(printf "%o" "$i")"
+  done
+  for (( i = 128; i <= 255; i++ )); do
+    printf '\0\0\xdf\'"$(printf "%o" "$i")"
+  done
+}
+
+check_posix_result ()
+{
+  if [ $? -eq 0 ]; then
+    result=PASS
+  else
+    result=FAIL
+  fi
+
+  echo "$result: from \"$1\", to: \"$2\""
+
+  if [ "$result" != "PASS" ]; then
+    exit 1
+  fi
+}
+
+check_posix_encoding ()
+{
+  eval PROG=\"$ICONV\"
+  allbytes  | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be)
+  check_posix_result POSIX UCS-4BE
+  allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes)
+  check_posix_result UCS-4BE POSIX
+}
+
+check_posix_encoding
diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c
index bfd34eee31..b379481844 100644
--- a/inet/tst-idna_name_classify.c
+++ b/inet/tst-idna_name_classify.c
@@ -37,11 +37,11 @@ do_test (void)
   puts ("info: C locale tests");
   locale_insensitive_tests ();
   TEST_COMPARE (__idna_name_classify ("abc\200def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
   TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
-                idna_name_encoding_error);
+                idna_name_nonascii_backslash);
   TEST_COMPARE (__idna_name_classify ("abc\377def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
 
   puts ("info: en_US.ISO-8859-1 locale tests");
   if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)
diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
index 6bd0367069..f30396ae12 100644
--- a/locale/tst-C-locale.c
+++ b/locale/tst-C-locale.c
@@ -229,6 +229,75 @@ run_test (const char *locname)
   STRTEST (YESSTR, "");
   STRTEST (NOSTR, "");
 
+#define CONVTEST(b, v) \
+  {									      \
+    unsigned char bs[] = {b, 0};					      \
+    mbstate_t ctx = {};							      \
+    wchar_t wc = -1;							      \
+    size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);			      \
+    if (sz != !!b)							      \
+      {									      \
+	printf ("mbrtowc(%02hhx) width in locale %s wrong "		      \
+		"(is %zd, should be %d)\n", *bs, locname, sz, !!b);	      \
+	result = 1;							      \
+      }									      \
+    if (wc != v)							      \
+      {									      \
+	printf ("mbrtowc(%02hhx) value in locale %s wrong "		      \
+		"(is %x, should be %x)\n", *bs, locname, wc, v);	      \
+	result = 1;							      \
+      }									      \
+  }
+  for(int i = 0; i <= 0x7f; ++i)
+    CONVTEST(i, i);
+  for(int i = 0x80; i <= 0xff; ++i)
+    CONVTEST(i, 0xdf00 + i);
+
+#define DECONVTEST(v, b) \
+  {									      \
+    unsigned char ob = -1;						      \
+    mbstate_t ctx = {};							      \
+    size_t sz = wcrtomb((char *) &ob, v, &ctx);				      \
+    if (sz != 1)							      \
+      {									      \
+	printf ("wcrtomb(%x) width in locale %s wrong "			      \
+		"(is %zd, should be 1)\n", v, locname, sz);		      \
+	result = 1;							      \
+      }									      \
+    if (ob != b)							      \
+      {									      \
+	printf ("wcrtomb(%x) value in locale %s wrong "			      \
+		"(is %hhx, should be %hhx)\n", v, locname, ob, b);	      \
+	result = 1;							      \
+      }									      \
+  }
+#define DECONVERR(v) \
+  {									      \
+    unsigned char ob = -1;						      \
+    mbstate_t ctx = {};							      \
+    size_t sz = wcrtomb((char *) &ob, v, &ctx);				      \
+    if (sz != (size_t) -1)						      \
+      {									      \
+	printf ("wcrtomb(%x) width in locale %s wrong "			      \
+		"(is %zd, should be (size_t )-1)\n", v, locname, sz);	      \
+	result = 1;							      \
+      }									      \
+    if (ob != (unsigned char) -1)					      \
+      {									      \
+	printf ("wcrtomb(%x) value in locale %s wrong "			      \
+		"(is %hhx, should be unchanged)\n", v, locname, ob);	      \
+	result = 1;							      \
+      }									      \
+  }
+  for(int i = 0; i <= 0x7f; ++i)
+    DECONVTEST(i, i);
+  for(int i = 0x80; i < 0xdf00; ++i)
+    DECONVERR(i);
+  for(int i = 0x80; i <= 0xff; ++i)
+    DECONVTEST(0xdf00 + i, i);
+  for(int i = 0xe000; i <= 0xffff; ++i)
+    DECONVERR(i);
+
   /* Test the new locale mechanisms.  */
   loc = newlocale (LC_ALL_MASK, locname, NULL);
   if (loc == NULL)
diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
index 7ec7f1c577..fc34a6abc1 100644
--- a/localedata/locales/POSIX
+++ b/localedata/locales/POSIX
@@ -97,6 +97,20 @@ END LC_CTYPE
 LC_COLLATE
 % This is the POSIX Locale definition for the LC_COLLATE category.
 % The order is the same as in the ASCII code set.
+% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2,
+% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale:
+% > All characters not explicitly listed here shall be inserted
+% > in the character collation order after the listed characters
+% > and shall be assigned unique primary weights. If the listed
+% > characters have ASCII encoding, the other characters shall
+% > be in ascending order according to their coded character set values
+% Since Issue 7 TC2 (XBD, 6.2 Character Encoding):
+% > The POSIX locale shall contain 256 single-byte characters [...]
+% (cf. bug 663, 674).
+% this is in contrast to previous issues, which limited the POSIX
+% locale to the Portable Character Set (7-bit ASCII).
+% We use the end of the Low Surrogate Area to contain these,
+% yielding [<UDF80>, <UDFFF>]
 order_start forward
 <U0000>
 <U0001>
@@ -226,7 +240,134 @@ order_start forward
 <U007D>
 <U007E>
 <U007F>
-UNDEFINED
+<UDF80>
+<UDF81>
+<UDF82>
+<UDF83>
+<UDF84>
+<UDF85>
+<UDF86>
+<UDF87>
+<UDF88>
+<UDF89>
+<UDF8A>
+<UDF8B>
+<UDF8C>
+<UDF8D>
+<UDF8E>
+<UDF8F>
+<UDF90>
+<UDF91>
+<UDF92>
+<UDF93>
+<UDF94>
+<UDF95>
+<UDF96>
+<UDF97>
+<UDF98>
+<UDF99>
+<UDF9A>
+<UDF9B>
+<UDF9C>
+<UDF9D>
+<UDF9E>
+<UDF9F>
+<UDFA0>
+<UDFA1>
+<UDFA2>
+<UDFA3>
+<UDFA4>
+<UDFA5>
+<UDFA6>
+<UDFA7>
+<UDFA8>
+<UDFA9>
+<UDFAA>
+<UDFAB>
+<UDFAC>
+<UDFAD>
+<UDFAE>
+<UDFAF>
+<UDFB0>
+<UDFB1>
+<UDFB2>
+<UDFB3>
+<UDFB4>
+<UDFB5>
+<UDFB6>
+<UDFB7>
+<UDFB8>
+<UDFB9>
+<UDFBA>
+<UDFBB>
+<UDFBC>
+<UDFBD>
+<UDFBE>
+<UDFBF>
+<UDFC0>
+<UDFC1>
+<UDFC2>
+<UDFC3>
+<UDFC4>
+<UDFC5>
+<UDFC6>
+<UDFC7>
+<UDFC8>
+<UDFC9>
+<UDFCA>
+<UDFCB>
+<UDFCC>
+<UDFCD>
+<UDFCE>
+<UDFCF>
+<UDFD0>
+<UDFD1>
+<UDFD2>
+<UDFD3>
+<UDFD4>
+<UDFD5>
+<UDFD6>
+<UDFD7>
+<UDFD8>
+<UDFD9>
+<UDFDA>
+<UDFDB>
+<UDFDC>
+<UDFDD>
+<UDFDE>
+<UDFDF>
+<UDFE0>
+<UDFE1>
+<UDFE2>
+<UDFE3>
+<UDFE4>
+<UDFE5>
+<UDFE6>
+<UDFE7>
+<UDFE8>
+<UDFE9>
+<UDFEA>
+<UDFEB>
+<UDFEC>
+<UDFED>
+<UDFEE>
+<UDFEF>
+<UDFF0>
+<UDFF1>
+<UDFF2>
+<UDFF3>
+<UDFF4>
+<UDFF5>
+<UDFF6>
+<UDFF7>
+<UDFF8>
+<UDFF9>
+<UDFFA>
+<UDFFB>
+<UDFFC>
+<UDFFD>
+<UDFFE>
+<UDFFF>
 order_end
 %
 END LC_COLLATE
diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
index 44844e71c3..e66242b58f 100644
--- a/stdio-common/tst-printf-bz25691.c
+++ b/stdio-common/tst-printf-bz25691.c
@@ -30,6 +30,8 @@
 static int
 do_test (void)
 {
+  setlocale(LC_CTYPE, "C.UTF-8");
+
   mtrace ();
 
   /* For 's' conversion specifier with 'l' modifier the array must be
diff --git a/sysdeps/s390/multiarch/gconv_simple.c b/sysdeps/s390/multiarch/gconv_simple.c
index 41132f620a..3896bdd96a 100644
--- a/sysdeps/s390/multiarch/gconv_simple.c
+++ b/sysdeps/s390/multiarch/gconv_simple.c
@@ -68,6 +68,8 @@
 
 # undef __gconv_transform_ascii_internal
 # undef __gconv_transform_internal_ascii
+# undef __gconv_transform_posix_internal
+# undef __gconv_transform_internal_posix
 # undef __gconv_transform_internal_ucs4le
 # undef __gconv_transform_ucs4_internal
 # undef __gconv_transform_ucs4le_internal
@@ -385,6 +387,302 @@ ICONV_VX_IFUNC (__gconv_transform_ascii_internal)
 # undef BODY_ORIG_ERROR
 ICONV_VX_IFUNC (__gconv_transform_internal_ascii)
 
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DF80, U+DFFF]}
+   to the internal (UCS4-like) format.  */
+# define DEFINE_INIT		0
+# define DEFINE_FINI		0
+# define MIN_NEEDED_FROM	1
+# define MIN_NEEDED_TO		4
+# define FROM_DIRECTION		1
+# define FROM_LOOP		ICONV_VX_NAME (posix_internal_loop)
+# define TO_LOOP		ICONV_VX_NAME (posix_internal_loop) /* This is not used.  */
+# define FUNCTION_NAME		ICONV_VX_NAME (__gconv_transform_posix_internal)
+# define ONE_DIRECTION		1
+
+# define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+# define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+# define LOOPFCT		FROM_LOOP
+
+# define BODY_ORIG \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdf00 + *inptr++;			      \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;				      \
+    outptr += sizeof (uint32_t);					      \
+  }
+# define BODY								\
+  {									\
+    size_t len = inend - inptr;						\ TODO: entirely ascii_internal_loop, above
+    if (len > (outend - outptr) / 4)					\
+      len = (outend - outptr) / 4;					\
+    size_t loop_count, tmp;						\
+    __asm__ volatile (".machine push\n\t"				\
+		      ".machine \"z13\"\n\t"				\
+		      ".machinemode \"zarch_nohighgprs\"\n\t"		\
+		      CONVERT_32BIT_SIZE_T ([R_LEN])			\
+		      "    vrepib %%v30,0x7f\n\t" /* For compare > 0x7f.  */ \
+		      "    srlg %[R_LI],%[R_LEN],4\n\t"			\
+		      "    vrepib %%v31,0x20\n\t"			\
+		      "    clgije %[R_LI],0,1f\n\t"			\
+		      "0:  \n\t" /* Handle 16-byte blocks.  */		\
+		      "    vl %%v16,0(%[R_IN])\n\t"			\
+		      /* Checking for values > 0x7f.  */		\
+		      "    vstrcbs %%v17,%%v16,%%v30,%%v31\n\t"		\
+		      "    jno 10f\n\t"					\
+		      /* Enlarge to UCS4.  */				\
+		      "    vuplhb %%v17,%%v16\n\t"			\
+		      "    vupllb %%v18,%%v16\n\t"			\
+		      "    vuplhh %%v19,%%v17\n\t"			\
+		      "    vupllh %%v20,%%v17\n\t"			\
+		      "    vuplhh %%v21,%%v18\n\t"			\
+		      "    vupllh %%v22,%%v18\n\t"			\
+		      /* Store 64bytes to buf_out.  */			\
+		      "    vstm %%v19,%%v22,0(%[R_OUT])\n\t"		\
+		      "    la %[R_IN],16(%[R_IN])\n\t"			\
+		      "    la %[R_OUT],64(%[R_OUT])\n\t"		\
+		      "    brctg %[R_LI],0b\n\t"			\
+		      "    lghi %[R_LI],15\n\t"				\
+		      "    ngr %[R_LEN],%[R_LI]\n\t"			\
+		      "    je 20f\n\t" /* Jump away if no remaining bytes.  */ \
+		      /* Handle remaining bytes.  */			\
+		      "1: aghik %[R_LI],%[R_LEN],-1\n\t"		\
+		      "    jl 20f\n\t" /* Jump away if no remaining bytes.  */ \
+		      "    vll %%v16,%[R_LI],0(%[R_IN])\n\t"		\
+		      /* Checking for values > 0x7f.  */		\
+		      "    vstrcbs %%v17,%%v16,%%v30,%%v31\n\t"		\
+		      "    vlgvb %[R_TMP],%%v17,7\n\t"			\
+		      "    clr %[R_TMP],%[R_LI]\n\t"			\
+		      "    locrh %[R_TMP],%[R_LEN]\n\t"			\
+		      "    locghih %[R_LEN],0\n\t"			\
+		      "    j 12f\n\t"					\
+		      "10:\n\t"						\
+		      /* Found a value > 0x7f.				\
+			 Store the preceding chars.  */			\
+		      "    vlgvb %[R_TMP],%%v17,7\n\t"			\
+		      "12: la %[R_IN],0(%[R_TMP],%[R_IN])\n\t"		\
+		      "    sllk %[R_TMP],%[R_TMP],2\n\t"		\
+		      "    ahi %[R_TMP],-1\n\t"				\
+		      "    jl 20f\n\t"					\
+		      "    lgr %[R_LI],%[R_TMP]\n\t"			\
+		      "    vuplhb %%v17,%%v16\n\t"			\
+		      "    vuplhh %%v19,%%v17\n\t"			\
+		      "    vstl %%v19,%[R_LI],0(%[R_OUT])\n\t"		\
+		      "    ahi %[R_LI],-16\n\t"				\
+		      "    jl 11f\n\t"					\
+		      "    vupllh %%v20,%%v17\n\t"			\
+		      "    vstl %%v20,%[R_LI],16(%[R_OUT])\n\t"		\
+		      "    ahi %[R_LI],-16\n\t"				\
+		      "    jl 11f\n\t"					\
+		      "    vupllb %%v18,%%v16\n\t"			\
+		      "    vuplhh %%v21,%%v18\n\t"			\
+		      "    vstl %%v21,%[R_LI],32(%[R_OUT])\n\t"		\
+		      "    ahi %[R_LI],-16\n\t"				\
+		      "    jl 11f\n\t"					\
+		      "    vupllh %%v22,%%v18\n\t"			\
+		      "    vstl %%v22,%[R_LI],48(%[R_OUT])\n\t"		\
+		      "11:\n\t"						\
+		      "    la %[R_OUT],1(%[R_TMP],%[R_OUT])\n\t"	\
+		      "20:\n\t"						\
+		      ".machine pop"					\
+		      : /* outputs */ [R_OUT] "+a" (outptr)		\
+			, [R_IN] "+a" (inptr)				\
+			, [R_LEN] "+d" (len)				\
+			, [R_LI] "=d" (loop_count)			\
+			, [R_TMP] "=a" (tmp)				\
+		      : /* inputs */					\
+		      : /* clobber list*/ "memory", "cc"		\
+			ASM_CLOBBER_VR ("v16") ASM_CLOBBER_VR ("v17")	\
+			ASM_CLOBBER_VR ("v18") ASM_CLOBBER_VR ("v19")	\
+			ASM_CLOBBER_VR ("v20") ASM_CLOBBER_VR ("v21")	\
+			ASM_CLOBBER_VR ("v22") ASM_CLOBBER_VR ("v30")	\
+			ASM_CLOBBER_VR ("v31")				\
+		      );						\
+    if (len > 0)							\
+      {									\
+	/* Found an invalid character at the next input byte.  */	\
+	BODY_ORIG_ERROR							\
+      }									\
+  }
+
+# include <iconv/loop.c>
+# include <iconv/skeleton.c>
+# undef BODY_ORIG
+# undef BODY_ORIG_ERROR
+ICONV_VX_IFUNC (__gconv_transform_posix_internal)
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DF80, U+DFFF] => [0x80, 0xFF]}.  */
+# define DEFINE_INIT		0
+# define DEFINE_FINI		0
+# define MIN_NEEDED_FROM	4
+# define MIN_NEEDED_TO		1
+# define FROM_DIRECTION		1
+# define FROM_LOOP		ICONV_VX_NAME (internal_posix_loop)
+# define TO_LOOP		ICONV_VX_NAME (internal_posix_loop) /* This is not used.  */
+# define FUNCTION_NAME		ICONV_VX_NAME (__gconv_transform_internal_posix)
+# define ONE_DIRECTION		1
+
+# define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+# define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+# define LOOPFCT		FROM_LOOP
+# define BODY_ORIG_ERROR						\
+  UNICODE_TAG_HANDLER (*((const uint32_t *) inptr), 4);			\
+  STANDARD_TO_LOOP_ERR_HANDLER (4);
+
+# define BODY_ORIG \
+  {									\
+    uint32_t val = *((const uint32_t *) inptr);				\
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdf80) || val > 0xdfff))\
+      {									\
+	UNICODE_TAG_HANDLER (val, 4);					\
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				\
+      }									\
+    else								\
+      {									\
+	if (__glibc_unlikely (val > 0x7f))				\
+	  val -= 0xdf00;						\
+	*outptr++ = val;						\
+	inptr += sizeof (uint32_t);					\
+      }									\
+  }
+
+# define BODY								\
+  {									\
+    size_t len = (inend - inptr) / 4;					\ TODO: entirely internal_ascii_loop, above
+    if (len > outend - outptr)						\
+      len = outend - outptr;						\
+    size_t loop_count, tmp, tmp2;					\
+    __asm__ volatile (".machine push\n\t"				\
+		      ".machine \"z13\"\n\t"				\
+		      ".machinemode \"zarch_nohighgprs\"\n\t"		\
+		      CONVERT_32BIT_SIZE_T ([R_LEN])			\
+		      /* Setup to check for ch > 0x7f.  */		\
+		      "    vzero %%v21\n\t"				\
+		      "    srlg %[R_LI],%[R_LEN],4\n\t"			\
+		      "    vleih %%v21,8192,0\n\t"  /* element 0:   >  */ \
+		      "    vleih %%v21,-8192,2\n\t" /* element 1: =<>  */ \
+		      "    vleif %%v20,127,0\n\t"   /* element 0: 127  */ \
+		      "    lghi %[R_TMP],0\n\t"				\
+		      "    clgije %[R_LI],0,1f\n\t"			\
+		      "0:\n\t"						\
+		      "    vlm %%v16,%%v19,0(%[R_IN])\n\t"		\
+		      /* Shorten to byte values.  */			\
+		      "    vpkf %%v23,%%v16,%%v17\n\t"			\
+		      "    vpkf %%v24,%%v18,%%v19\n\t"			\
+		      "    vpkh %%v23,%%v23,%%v24\n\t"			\
+		      /* Checking for values > 0x7f.  */		\
+		      "    vstrcfs %%v22,%%v16,%%v20,%%v21\n\t"		\
+		      "    jno 10f\n\t"					\
+		      "    vstrcfs %%v22,%%v17,%%v20,%%v21\n\t"		\
+		      "    jno 11f\n\t"					\
+		      "    vstrcfs %%v22,%%v18,%%v20,%%v21\n\t"		\
+		      "    jno 12f\n\t"					\
+		      "    vstrcfs %%v22,%%v19,%%v20,%%v21\n\t"		\
+		      "    jno 13f\n\t"					\
+		      /* Store 16bytes to outptr.  */			\
+		      "    vst %%v23,0(%[R_OUT])\n\t"			\
+		      "    la %[R_IN],64(%[R_IN])\n\t"			\
+		      "    la %[R_OUT],16(%[R_OUT])\n\t"		\
+		      "    brctg %[R_LI],0b\n\t"			\
+		      "    lghi %[R_LI],15\n\t"				\
+		      "    ngr %[R_LEN],%[R_LI]\n\t"			\
+		      "    je 20f\n\t" /* Jump away if no remaining bytes.  */ \
+		      /* Handle remaining bytes.  */			\
+		      "1: sllg %[R_LI],%[R_LEN],2\n\t"			\
+		      "    aghi %[R_LI],-1\n\t"				\
+		      "    jl 20f\n\t" /* Jump away if no remaining bytes.  */ \
+		      /* Load remaining 1...63 bytes.  */		\
+		      "    vll %%v16,%[R_LI],0(%[R_IN])\n\t"		\
+		      "    ahi %[R_LI],-16\n\t"				\
+		      "    jl 2f\n\t"					\
+		      "    vll %%v17,%[R_LI],16(%[R_IN])\n\t"		\
+		      "    ahi %[R_LI],-16\n\t"				\
+		      "    jl 2f\n\t"					\
+		      "    vll %%v18,%[R_LI],32(%[R_IN])\n\t"		\
+		      "    ahi %[R_LI],-16\n\t"				\
+		      "    jl 2f\n\t"					\
+		      "    vll %%v19,%[R_LI],48(%[R_IN])\n\t"		\
+		      "2:\n\t"						\
+		      /* Shorten to byte values.  */			\
+		      "    vpkf %%v23,%%v16,%%v17\n\t"			\
+		      "    vpkf %%v24,%%v18,%%v19\n\t"			\
+		      "    vpkh %%v23,%%v23,%%v24\n\t"			\
+		      "    sllg %[R_LI],%[R_LEN],2\n\t"			\
+		      "    aghi %[R_LI],-16\n\t"			\
+		      "    jl 3f\n\t" /* v16 is not fully loaded.  */	\
+		      "    vstrcfs %%v22,%%v16,%%v20,%%v21\n\t"		\
+		      "    jno 10f\n\t"					\
+		      "    aghi %[R_LI],-16\n\t"			\
+		      "    jl 4f\n\t" /* v17 is not fully loaded.  */	\
+		      "    vstrcfs %%v22,%%v17,%%v20,%%v21\n\t"		\
+		      "    jno 11f\n\t"					\
+		      "    aghi %[R_LI],-16\n\t"			\
+		      "    jl 5f\n\t" /* v18 is not fully loaded.  */	\
+		      "    vstrcfs %%v22,%%v18,%%v20,%%v21\n\t"		\
+		      "    jno 12f\n\t"					\
+		      "    aghi %[R_LI],-16\n\t"			\
+		      /* v19 is not fully loaded. */			\
+		      "    lghi %[R_TMP],12\n\t"			\
+		      "    vstrcfs %%v22,%%v19,%%v20,%%v21\n\t"		\
+		      "6: vlgvb %[R_I],%%v22,7\n\t"			\
+		      "    aghi %[R_LI],16\n\t"				\
+		      "    clrjl %[R_I],%[R_LI],14f\n\t"		\
+		      "    lgr %[R_I],%[R_LEN]\n\t"			\
+		      "    lghi %[R_LEN],0\n\t"				\
+		      "    j 15f\n\t"					\
+		      "3: vstrcfs %%v22,%%v16,%%v20,%%v21\n\t"		\
+		      "    j 6b\n\t"					\
+		      "4: vstrcfs %%v22,%%v17,%%v20,%%v21\n\t"		\
+		      "    lghi %[R_TMP],4\n\t"				\
+		      "    j 6b\n\t"					\
+		      "5: vstrcfs %%v22,%%v17,%%v20,%%v21\n\t"		\
+		      "    lghi %[R_TMP],8\n\t"				\
+		      "    j 6b\n\t"					\
+		      /* Found a value > 0x7f.  */			\
+		      "13: ahi %[R_TMP],4\n\t"				\
+		      "12: ahi %[R_TMP],4\n\t"				\
+		      "11: ahi %[R_TMP],4\n\t"				\
+		      "10: vlgvb %[R_I],%%v22,7\n\t"			\
+		      "14: srlg %[R_I],%[R_I],2\n\t"			\
+		      "    agr %[R_I],%[R_TMP]\n\t"			\
+		      "    je 20f\n\t"					\
+		      /* Store characters before invalid one...  */	\
+		      "15: aghi %[R_I],-1\n\t"				\
+		      "    vstl %%v23,%[R_I],0(%[R_OUT])\n\t"		\
+		      /* ... and update pointers.  */			\
+		      "    la %[R_OUT],1(%[R_I],%[R_OUT])\n\t"		\
+		      "    sllg %[R_I],%[R_I],2\n\t"			\
+		      "    la %[R_IN],4(%[R_I],%[R_IN])\n\t"		\
+		      "20:\n\t"						\
+		      ".machine pop"					\
+		      : /* outputs */ [R_OUT] "+a" (outptr)		\
+			, [R_IN] "+a" (inptr)				\
+			, [R_LEN] "+d" (len)				\
+			, [R_LI] "=d" (loop_count)			\
+			, [R_I] "=a" (tmp2)				\
+			, [R_TMP] "=d" (tmp)				\
+		      : /* inputs */					\
+		      : /* clobber list*/ "memory", "cc"		\
+			ASM_CLOBBER_VR ("v16") ASM_CLOBBER_VR ("v17")	\
+			ASM_CLOBBER_VR ("v18") ASM_CLOBBER_VR ("v19")	\
+			ASM_CLOBBER_VR ("v20") ASM_CLOBBER_VR ("v21")	\
+			ASM_CLOBBER_VR ("v22") ASM_CLOBBER_VR ("v23")	\
+			ASM_CLOBBER_VR ("v24")				\
+		      );						\
+    if (len > 0)							\
+      {									\
+	/* Found an invalid character > 0x7f at next character.  */	\
+	BODY_ORIG_ERROR							\
+      }									\
+  }
+# define LOOP_NEED_FLAGS
+# include <iconv/loop.c>
+# include <iconv/skeleton.c>
+# undef BODY_ORIG
+# undef BODY_ORIG_ERROR
+ICONV_VX_IFUNC (__gconv_transform_internal_posix)
+
 
 /* Convert from internal UCS4 to UCS4 little endian form.  */
 # define DEFINE_INIT		0
diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
index 0f0f55f9ed..f87099bcf5 100644
--- a/wcsmbs/wcsmbsload.c
+++ b/wcsmbs/wcsmbsload.c
@@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
   .__shlib_handle = NULL,
   .__modname = NULL,
   .__counter = INT_MAX,
-  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
+  .__from_name = (char *) "POSIX",
   .__to_name = (char *) "INTERNAL",
-  .__fct = __gconv_transform_ascii_internal,
-  .__btowc_fct = __gconv_btwoc_ascii,
+  .__fct = __gconv_transform_posix_internal,
+  .__btowc_fct = __gconv_btwoc_posix,
   .__init_fct = NULL,
   .__end_fct = NULL,
   .__min_needed_from = 1,
@@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
   .__modname = NULL,
   .__counter = INT_MAX,
   .__from_name = (char *) "INTERNAL",
-  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
-  .__fct = __gconv_transform_internal_ascii,
+  .__to_name = (char *) "POSIX",
+  .__fct = __gconv_transform_internal_posix,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v2] POSIX locale covers every byte [BZ# 29511]
  2022-08-30 18:19 [PATCH] POSIX locale covers every byte [BZ# 29511] наб
@ 2022-09-06 14:06 ` наб
  2022-09-06 14:19 ` [PATCH] " Florian Weimer
  1 sibling, 0 replies; 29+ messages in thread
From: наб @ 2022-09-06 14:06 UTC (permalink / raw)
  To: libc-alpha; +Cc: Florian Weimer

[-- Attachment #1: Type: text/plain, Size: 28591 bytes --]

This is a trivial patch, largely duplicating the extant ASCII code

There are two user-facing changes:
  * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
  * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b

Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
  (a) is 1-byte, stateless, and contains 256 characters
  (b) they collate in byte order
  (c) the first 128 characters are equivalent to ASCII (like previous)
cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
changes to the standard;
in short, this means that mbrtowc() must never fail and must return
  b if b <= 0x7F else ab+c for all bytes b
  where c is some constant >=0x80
    and a is a positive integer constant

By strategically picking c=<UDF00> we land at the tail-end of the
Unicode Low Surrogate Area at DC00-DFFF, described as
  > Isolated surrogate code points have no interpretation;
  > consequently, no character code charts or names lists
  > are provided for this range.
and match musl

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
v2: rebased, no changes, resending after a week per guidelines

 iconv/gconv_builtin.h                 |   8 +
 iconv/gconv_int.h                     |   7 +
 iconv/gconv_simple.c                  |  75 +++++++
 iconv/tst-iconv_prog.sh               |  43 ++++
 inet/tst-idna_name_classify.c         |   6 +-
 locale/tst-C-locale.c                 |  69 ++++++
 localedata/locales/POSIX              | 143 +++++++++++-
 stdio-common/tst-printf-bz25691.c     |   2 +
 sysdeps/s390/multiarch/gconv_simple.c | 298 ++++++++++++++++++++++++++
 wcsmbs/wcsmbsload.c                   |  10 +-
 10 files changed, 652 insertions(+), 9 deletions(-)

diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h
index 68c2369b1f..cd1805b3ce 100644
--- a/iconv/gconv_builtin.h
+++ b/iconv/gconv_builtin.h
@@ -89,6 +89,14 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
 			__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
 
 
+BUILTIN_TRANSFORMATION ("POSIX//", "INTERNAL", 1, "=posix->INTERNAL",
+			__gconv_transform_posix_internal, __gconv_btwoc_posix,
+			1, 1, 4, 4)
+
+BUILTIN_TRANSFORMATION ("INTERNAL", "POSIX//", 1, "=INTERNAL->posix",
+			__gconv_transform_internal_posix, NULL, 4, 4, 1, 1)
+
+
 #if BYTE_ORDER == BIG_ENDIAN
 BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
 BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index 1c6745043e..45ab1edfad 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -281,6 +281,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2)
 
 __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
+__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
 __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
 __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
@@ -299,6 +301,11 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
    only ASCII characters.  */
 extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
 
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c);
+
 #endif
 
 __END_DECLS
diff --git a/iconv/gconv_simple.c b/iconv/gconv_simple.c
index 640068d9ba..4cd01854cd 100644
--- a/iconv/gconv_simple.c
+++ b/iconv/gconv_simple.c
@@ -53,6 +53,18 @@ __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c)
     return WEOF;
 }
 
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+wint_t
+__gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+{
+  if (c < 0x80)
+    return c;
+  else
+    return 0xdf00 + c;
+}
+
 
 /* Transform from the internal, UCS4-like format, to UCS4.  The
    difference between the internal ucs4 format and the real UCS4
@@ -868,6 +880,69 @@ ucs4le_internal_loop_single (struct __gconv_step *step,
 #include <iconv/skeleton.c>
 
 
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DF80, U+DFFF]}
+   to the internal (UCS4-like) format.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		1
+#define MIN_NEEDED_TO		4
+#define FROM_DIRECTION		1
+#define FROM_LOOP		posix_internal_loop
+#define TO_LOOP			posix_internal_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_posix_internal
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdf00 + *inptr++;			      \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;				      \
+    outptr += sizeof (uint32_t);					      \
+  }
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DF80, U+DFFF] => [0x80, 0xFF]}.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		4
+#define MIN_NEEDED_TO		1
+#define FROM_DIRECTION		1
+#define FROM_LOOP		internal_posix_loop
+#define TO_LOOP			internal_posix_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_internal_posix
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    uint32_t val = *((const uint32_t *) inptr);				      \
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdf80) || val > 0xdfff))      \
+      {									      \
+	UNICODE_TAG_HANDLER (val, 4);					      \
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
+      }									      \
+    else								      \
+      {									      \
+	if (__glibc_unlikely (val > 0x7f))				      \
+	  val -= 0xdf00;						      \
+	*outptr++ = val;						      \
+	inptr += sizeof (uint32_t);					      \
+      }									      \
+  }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
 /* Convert from the internal (UCS4-like) format to UTF-8.  */
 #define DEFINE_INIT		0
 #define DEFINE_FINI		0
diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
index b3d8bf5110..a24d8d2207 100644
--- a/iconv/tst-iconv_prog.sh
+++ b/iconv/tst-iconv_prog.sh
@@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do
   execute_test
   check_errtest_result
 done
+
+allbytes ()
+{
+  for (( i = 0; i <= 255; i++ )); do
+    printf '\'"$(printf "%o" "$i")"
+  done
+}
+
+allucs4be ()
+{
+  for (( i = 0; i <= 127; i++ )); do
+    printf '\0\0\0\'"$(printf "%o" "$i")"
+  done
+  for (( i = 128; i <= 255; i++ )); do
+    printf '\0\0\xdf\'"$(printf "%o" "$i")"
+  done
+}
+
+check_posix_result ()
+{
+  if [ $? -eq 0 ]; then
+    result=PASS
+  else
+    result=FAIL
+  fi
+
+  echo "$result: from \"$1\", to: \"$2\""
+
+  if [ "$result" != "PASS" ]; then
+    exit 1
+  fi
+}
+
+check_posix_encoding ()
+{
+  eval PROG=\"$ICONV\"
+  allbytes  | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be)
+  check_posix_result POSIX UCS-4BE
+  allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes)
+  check_posix_result UCS-4BE POSIX
+}
+
+check_posix_encoding
diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c
index bfd34eee31..b379481844 100644
--- a/inet/tst-idna_name_classify.c
+++ b/inet/tst-idna_name_classify.c
@@ -37,11 +37,11 @@ do_test (void)
   puts ("info: C locale tests");
   locale_insensitive_tests ();
   TEST_COMPARE (__idna_name_classify ("abc\200def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
   TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
-                idna_name_encoding_error);
+                idna_name_nonascii_backslash);
   TEST_COMPARE (__idna_name_classify ("abc\377def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
 
   puts ("info: en_US.ISO-8859-1 locale tests");
   if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)
diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
index 6bd0367069..f30396ae12 100644
--- a/locale/tst-C-locale.c
+++ b/locale/tst-C-locale.c
@@ -229,6 +229,75 @@ run_test (const char *locname)
   STRTEST (YESSTR, "");
   STRTEST (NOSTR, "");
 
+#define CONVTEST(b, v) \
+  {									      \
+    unsigned char bs[] = {b, 0};					      \
+    mbstate_t ctx = {};							      \
+    wchar_t wc = -1;							      \
+    size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);			      \
+    if (sz != !!b)							      \
+      {									      \
+	printf ("mbrtowc(%02hhx) width in locale %s wrong "		      \
+		"(is %zd, should be %d)\n", *bs, locname, sz, !!b);	      \
+	result = 1;							      \
+      }									      \
+    if (wc != v)							      \
+      {									      \
+	printf ("mbrtowc(%02hhx) value in locale %s wrong "		      \
+		"(is %x, should be %x)\n", *bs, locname, wc, v);	      \
+	result = 1;							      \
+      }									      \
+  }
+  for(int i = 0; i <= 0x7f; ++i)
+    CONVTEST(i, i);
+  for(int i = 0x80; i <= 0xff; ++i)
+    CONVTEST(i, 0xdf00 + i);
+
+#define DECONVTEST(v, b) \
+  {									      \
+    unsigned char ob = -1;						      \
+    mbstate_t ctx = {};							      \
+    size_t sz = wcrtomb((char *) &ob, v, &ctx);				      \
+    if (sz != 1)							      \
+      {									      \
+	printf ("wcrtomb(%x) width in locale %s wrong "			      \
+		"(is %zd, should be 1)\n", v, locname, sz);		      \
+	result = 1;							      \
+      }									      \
+    if (ob != b)							      \
+      {									      \
+	printf ("wcrtomb(%x) value in locale %s wrong "			      \
+		"(is %hhx, should be %hhx)\n", v, locname, ob, b);	      \
+	result = 1;							      \
+      }									      \
+  }
+#define DECONVERR(v) \
+  {									      \
+    unsigned char ob = -1;						      \
+    mbstate_t ctx = {};							      \
+    size_t sz = wcrtomb((char *) &ob, v, &ctx);				      \
+    if (sz != (size_t) -1)						      \
+      {									      \
+	printf ("wcrtomb(%x) width in locale %s wrong "			      \
+		"(is %zd, should be (size_t )-1)\n", v, locname, sz);	      \
+	result = 1;							      \
+      }									      \
+    if (ob != (unsigned char) -1)					      \
+      {									      \
+	printf ("wcrtomb(%x) value in locale %s wrong "			      \
+		"(is %hhx, should be unchanged)\n", v, locname, ob);	      \
+	result = 1;							      \
+      }									      \
+  }
+  for(int i = 0; i <= 0x7f; ++i)
+    DECONVTEST(i, i);
+  for(int i = 0x80; i < 0xdf00; ++i)
+    DECONVERR(i);
+  for(int i = 0x80; i <= 0xff; ++i)
+    DECONVTEST(0xdf00 + i, i);
+  for(int i = 0xe000; i <= 0xffff; ++i)
+    DECONVERR(i);
+
   /* Test the new locale mechanisms.  */
   loc = newlocale (LC_ALL_MASK, locname, NULL);
   if (loc == NULL)
diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
index 7ec7f1c577..fc34a6abc1 100644
--- a/localedata/locales/POSIX
+++ b/localedata/locales/POSIX
@@ -97,6 +97,20 @@ END LC_CTYPE
 LC_COLLATE
 % This is the POSIX Locale definition for the LC_COLLATE category.
 % The order is the same as in the ASCII code set.
+% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2,
+% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale:
+% > All characters not explicitly listed here shall be inserted
+% > in the character collation order after the listed characters
+% > and shall be assigned unique primary weights. If the listed
+% > characters have ASCII encoding, the other characters shall
+% > be in ascending order according to their coded character set values
+% Since Issue 7 TC2 (XBD, 6.2 Character Encoding):
+% > The POSIX locale shall contain 256 single-byte characters [...]
+% (cf. bug 663, 674).
+% this is in contrast to previous issues, which limited the POSIX
+% locale to the Portable Character Set (7-bit ASCII).
+% We use the end of the Low Surrogate Area to contain these,
+% yielding [<UDF80>, <UDFFF>]
 order_start forward
 <U0000>
 <U0001>
@@ -226,7 +240,134 @@ order_start forward
 <U007D>
 <U007E>
 <U007F>
-UNDEFINED
+<UDF80>
+<UDF81>
+<UDF82>
+<UDF83>
+<UDF84>
+<UDF85>
+<UDF86>
+<UDF87>
+<UDF88>
+<UDF89>
+<UDF8A>
+<UDF8B>
+<UDF8C>
+<UDF8D>
+<UDF8E>
+<UDF8F>
+<UDF90>
+<UDF91>
+<UDF92>
+<UDF93>
+<UDF94>
+<UDF95>
+<UDF96>
+<UDF97>
+<UDF98>
+<UDF99>
+<UDF9A>
+<UDF9B>
+<UDF9C>
+<UDF9D>
+<UDF9E>
+<UDF9F>
+<UDFA0>
+<UDFA1>
+<UDFA2>
+<UDFA3>
+<UDFA4>
+<UDFA5>
+<UDFA6>
+<UDFA7>
+<UDFA8>
+<UDFA9>
+<UDFAA>
+<UDFAB>
+<UDFAC>
+<UDFAD>
+<UDFAE>
+<UDFAF>
+<UDFB0>
+<UDFB1>
+<UDFB2>
+<UDFB3>
+<UDFB4>
+<UDFB5>
+<UDFB6>
+<UDFB7>
+<UDFB8>
+<UDFB9>
+<UDFBA>
+<UDFBB>
+<UDFBC>
+<UDFBD>
+<UDFBE>
+<UDFBF>
+<UDFC0>
+<UDFC1>
+<UDFC2>
+<UDFC3>
+<UDFC4>
+<UDFC5>
+<UDFC6>
+<UDFC7>
+<UDFC8>
+<UDFC9>
+<UDFCA>
+<UDFCB>
+<UDFCC>
+<UDFCD>
+<UDFCE>
+<UDFCF>
+<UDFD0>
+<UDFD1>
+<UDFD2>
+<UDFD3>
+<UDFD4>
+<UDFD5>
+<UDFD6>
+<UDFD7>
+<UDFD8>
+<UDFD9>
+<UDFDA>
+<UDFDB>
+<UDFDC>
+<UDFDD>
+<UDFDE>
+<UDFDF>
+<UDFE0>
+<UDFE1>
+<UDFE2>
+<UDFE3>
+<UDFE4>
+<UDFE5>
+<UDFE6>
+<UDFE7>
+<UDFE8>
+<UDFE9>
+<UDFEA>
+<UDFEB>
+<UDFEC>
+<UDFED>
+<UDFEE>
+<UDFEF>
+<UDFF0>
+<UDFF1>
+<UDFF2>
+<UDFF3>
+<UDFF4>
+<UDFF5>
+<UDFF6>
+<UDFF7>
+<UDFF8>
+<UDFF9>
+<UDFFA>
+<UDFFB>
+<UDFFC>
+<UDFFD>
+<UDFFE>
+<UDFFF>
 order_end
 %
 END LC_COLLATE
diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
index 44844e71c3..e66242b58f 100644
--- a/stdio-common/tst-printf-bz25691.c
+++ b/stdio-common/tst-printf-bz25691.c
@@ -30,6 +30,8 @@
 static int
 do_test (void)
 {
+  setlocale(LC_CTYPE, "C.UTF-8");
+
   mtrace ();
 
   /* For 's' conversion specifier with 'l' modifier the array must be
diff --git a/sysdeps/s390/multiarch/gconv_simple.c b/sysdeps/s390/multiarch/gconv_simple.c
index 41132f620a..3896bdd96a 100644
--- a/sysdeps/s390/multiarch/gconv_simple.c
+++ b/sysdeps/s390/multiarch/gconv_simple.c
@@ -68,6 +68,8 @@
 
 # undef __gconv_transform_ascii_internal
 # undef __gconv_transform_internal_ascii
+# undef __gconv_transform_posix_internal
+# undef __gconv_transform_internal_posix
 # undef __gconv_transform_internal_ucs4le
 # undef __gconv_transform_ucs4_internal
 # undef __gconv_transform_ucs4le_internal
@@ -385,6 +387,302 @@ ICONV_VX_IFUNC (__gconv_transform_ascii_internal)
 # undef BODY_ORIG_ERROR
 ICONV_VX_IFUNC (__gconv_transform_internal_ascii)
 
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DF80, U+DFFF]}
+   to the internal (UCS4-like) format.  */
+# define DEFINE_INIT		0
+# define DEFINE_FINI		0
+# define MIN_NEEDED_FROM	1
+# define MIN_NEEDED_TO		4
+# define FROM_DIRECTION		1
+# define FROM_LOOP		ICONV_VX_NAME (posix_internal_loop)
+# define TO_LOOP		ICONV_VX_NAME (posix_internal_loop) /* This is not used.  */
+# define FUNCTION_NAME		ICONV_VX_NAME (__gconv_transform_posix_internal)
+# define ONE_DIRECTION		1
+
+# define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+# define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+# define LOOPFCT		FROM_LOOP
+
+# define BODY_ORIG \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdf00 + *inptr++;			      \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;				      \
+    outptr += sizeof (uint32_t);					      \
+  }
+# define BODY								\
+  {									\
+    size_t len = inend - inptr;						\ TODO: entirely ascii_internal_loop, above
+    if (len > (outend - outptr) / 4)					\
+      len = (outend - outptr) / 4;					\
+    size_t loop_count, tmp;						\
+    __asm__ volatile (".machine push\n\t"				\
+		      ".machine \"z13\"\n\t"				\
+		      ".machinemode \"zarch_nohighgprs\"\n\t"		\
+		      CONVERT_32BIT_SIZE_T ([R_LEN])			\
+		      "    vrepib %%v30,0x7f\n\t" /* For compare > 0x7f.  */ \
+		      "    srlg %[R_LI],%[R_LEN],4\n\t"			\
+		      "    vrepib %%v31,0x20\n\t"			\
+		      "    clgije %[R_LI],0,1f\n\t"			\
+		      "0:  \n\t" /* Handle 16-byte blocks.  */		\
+		      "    vl %%v16,0(%[R_IN])\n\t"			\
+		      /* Checking for values > 0x7f.  */		\
+		      "    vstrcbs %%v17,%%v16,%%v30,%%v31\n\t"		\
+		      "    jno 10f\n\t"					\
+		      /* Enlarge to UCS4.  */				\
+		      "    vuplhb %%v17,%%v16\n\t"			\
+		      "    vupllb %%v18,%%v16\n\t"			\
+		      "    vuplhh %%v19,%%v17\n\t"			\
+		      "    vupllh %%v20,%%v17\n\t"			\
+		      "    vuplhh %%v21,%%v18\n\t"			\
+		      "    vupllh %%v22,%%v18\n\t"			\
+		      /* Store 64bytes to buf_out.  */			\
+		      "    vstm %%v19,%%v22,0(%[R_OUT])\n\t"		\
+		      "    la %[R_IN],16(%[R_IN])\n\t"			\
+		      "    la %[R_OUT],64(%[R_OUT])\n\t"		\
+		      "    brctg %[R_LI],0b\n\t"			\
+		      "    lghi %[R_LI],15\n\t"				\
+		      "    ngr %[R_LEN],%[R_LI]\n\t"			\
+		      "    je 20f\n\t" /* Jump away if no remaining bytes.  */ \
+		      /* Handle remaining bytes.  */			\
+		      "1: aghik %[R_LI],%[R_LEN],-1\n\t"		\
+		      "    jl 20f\n\t" /* Jump away if no remaining bytes.  */ \
+		      "    vll %%v16,%[R_LI],0(%[R_IN])\n\t"		\
+		      /* Checking for values > 0x7f.  */		\
+		      "    vstrcbs %%v17,%%v16,%%v30,%%v31\n\t"		\
+		      "    vlgvb %[R_TMP],%%v17,7\n\t"			\
+		      "    clr %[R_TMP],%[R_LI]\n\t"			\
+		      "    locrh %[R_TMP],%[R_LEN]\n\t"			\
+		      "    locghih %[R_LEN],0\n\t"			\
+		      "    j 12f\n\t"					\
+		      "10:\n\t"						\
+		      /* Found a value > 0x7f.				\
+			 Store the preceding chars.  */			\
+		      "    vlgvb %[R_TMP],%%v17,7\n\t"			\
+		      "12: la %[R_IN],0(%[R_TMP],%[R_IN])\n\t"		\
+		      "    sllk %[R_TMP],%[R_TMP],2\n\t"		\
+		      "    ahi %[R_TMP],-1\n\t"				\
+		      "    jl 20f\n\t"					\
+		      "    lgr %[R_LI],%[R_TMP]\n\t"			\
+		      "    vuplhb %%v17,%%v16\n\t"			\
+		      "    vuplhh %%v19,%%v17\n\t"			\
+		      "    vstl %%v19,%[R_LI],0(%[R_OUT])\n\t"		\
+		      "    ahi %[R_LI],-16\n\t"				\
+		      "    jl 11f\n\t"					\
+		      "    vupllh %%v20,%%v17\n\t"			\
+		      "    vstl %%v20,%[R_LI],16(%[R_OUT])\n\t"		\
+		      "    ahi %[R_LI],-16\n\t"				\
+		      "    jl 11f\n\t"					\
+		      "    vupllb %%v18,%%v16\n\t"			\
+		      "    vuplhh %%v21,%%v18\n\t"			\
+		      "    vstl %%v21,%[R_LI],32(%[R_OUT])\n\t"		\
+		      "    ahi %[R_LI],-16\n\t"				\
+		      "    jl 11f\n\t"					\
+		      "    vupllh %%v22,%%v18\n\t"			\
+		      "    vstl %%v22,%[R_LI],48(%[R_OUT])\n\t"		\
+		      "11:\n\t"						\
+		      "    la %[R_OUT],1(%[R_TMP],%[R_OUT])\n\t"	\
+		      "20:\n\t"						\
+		      ".machine pop"					\
+		      : /* outputs */ [R_OUT] "+a" (outptr)		\
+			, [R_IN] "+a" (inptr)				\
+			, [R_LEN] "+d" (len)				\
+			, [R_LI] "=d" (loop_count)			\
+			, [R_TMP] "=a" (tmp)				\
+		      : /* inputs */					\
+		      : /* clobber list*/ "memory", "cc"		\
+			ASM_CLOBBER_VR ("v16") ASM_CLOBBER_VR ("v17")	\
+			ASM_CLOBBER_VR ("v18") ASM_CLOBBER_VR ("v19")	\
+			ASM_CLOBBER_VR ("v20") ASM_CLOBBER_VR ("v21")	\
+			ASM_CLOBBER_VR ("v22") ASM_CLOBBER_VR ("v30")	\
+			ASM_CLOBBER_VR ("v31")				\
+		      );						\
+    if (len > 0)							\
+      {									\
+	/* Found an invalid character at the next input byte.  */	\
+	BODY_ORIG_ERROR							\
+      }									\
+  }
+
+# include <iconv/loop.c>
+# include <iconv/skeleton.c>
+# undef BODY_ORIG
+# undef BODY_ORIG_ERROR
+ICONV_VX_IFUNC (__gconv_transform_posix_internal)
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DF80, U+DFFF] => [0x80, 0xFF]}.  */
+# define DEFINE_INIT		0
+# define DEFINE_FINI		0
+# define MIN_NEEDED_FROM	4
+# define MIN_NEEDED_TO		1
+# define FROM_DIRECTION		1
+# define FROM_LOOP		ICONV_VX_NAME (internal_posix_loop)
+# define TO_LOOP		ICONV_VX_NAME (internal_posix_loop) /* This is not used.  */
+# define FUNCTION_NAME		ICONV_VX_NAME (__gconv_transform_internal_posix)
+# define ONE_DIRECTION		1
+
+# define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+# define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+# define LOOPFCT		FROM_LOOP
+# define BODY_ORIG_ERROR						\
+  UNICODE_TAG_HANDLER (*((const uint32_t *) inptr), 4);			\
+  STANDARD_TO_LOOP_ERR_HANDLER (4);
+
+# define BODY_ORIG \
+  {									\
+    uint32_t val = *((const uint32_t *) inptr);				\
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdf80) || val > 0xdfff))\
+      {									\
+	UNICODE_TAG_HANDLER (val, 4);					\
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				\
+      }									\
+    else								\
+      {									\
+	if (__glibc_unlikely (val > 0x7f))				\
+	  val -= 0xdf00;						\
+	*outptr++ = val;						\
+	inptr += sizeof (uint32_t);					\
+      }									\
+  }
+
+# define BODY								\
+  {									\
+    size_t len = (inend - inptr) / 4;					\ TODO: entirely internal_ascii_loop, above
+    if (len > outend - outptr)						\
+      len = outend - outptr;						\
+    size_t loop_count, tmp, tmp2;					\
+    __asm__ volatile (".machine push\n\t"				\
+		      ".machine \"z13\"\n\t"				\
+		      ".machinemode \"zarch_nohighgprs\"\n\t"		\
+		      CONVERT_32BIT_SIZE_T ([R_LEN])			\
+		      /* Setup to check for ch > 0x7f.  */		\
+		      "    vzero %%v21\n\t"				\
+		      "    srlg %[R_LI],%[R_LEN],4\n\t"			\
+		      "    vleih %%v21,8192,0\n\t"  /* element 0:   >  */ \
+		      "    vleih %%v21,-8192,2\n\t" /* element 1: =<>  */ \
+		      "    vleif %%v20,127,0\n\t"   /* element 0: 127  */ \
+		      "    lghi %[R_TMP],0\n\t"				\
+		      "    clgije %[R_LI],0,1f\n\t"			\
+		      "0:\n\t"						\
+		      "    vlm %%v16,%%v19,0(%[R_IN])\n\t"		\
+		      /* Shorten to byte values.  */			\
+		      "    vpkf %%v23,%%v16,%%v17\n\t"			\
+		      "    vpkf %%v24,%%v18,%%v19\n\t"			\
+		      "    vpkh %%v23,%%v23,%%v24\n\t"			\
+		      /* Checking for values > 0x7f.  */		\
+		      "    vstrcfs %%v22,%%v16,%%v20,%%v21\n\t"		\
+		      "    jno 10f\n\t"					\
+		      "    vstrcfs %%v22,%%v17,%%v20,%%v21\n\t"		\
+		      "    jno 11f\n\t"					\
+		      "    vstrcfs %%v22,%%v18,%%v20,%%v21\n\t"		\
+		      "    jno 12f\n\t"					\
+		      "    vstrcfs %%v22,%%v19,%%v20,%%v21\n\t"		\
+		      "    jno 13f\n\t"					\
+		      /* Store 16bytes to outptr.  */			\
+		      "    vst %%v23,0(%[R_OUT])\n\t"			\
+		      "    la %[R_IN],64(%[R_IN])\n\t"			\
+		      "    la %[R_OUT],16(%[R_OUT])\n\t"		\
+		      "    brctg %[R_LI],0b\n\t"			\
+		      "    lghi %[R_LI],15\n\t"				\
+		      "    ngr %[R_LEN],%[R_LI]\n\t"			\
+		      "    je 20f\n\t" /* Jump away if no remaining bytes.  */ \
+		      /* Handle remaining bytes.  */			\
+		      "1: sllg %[R_LI],%[R_LEN],2\n\t"			\
+		      "    aghi %[R_LI],-1\n\t"				\
+		      "    jl 20f\n\t" /* Jump away if no remaining bytes.  */ \
+		      /* Load remaining 1...63 bytes.  */		\
+		      "    vll %%v16,%[R_LI],0(%[R_IN])\n\t"		\
+		      "    ahi %[R_LI],-16\n\t"				\
+		      "    jl 2f\n\t"					\
+		      "    vll %%v17,%[R_LI],16(%[R_IN])\n\t"		\
+		      "    ahi %[R_LI],-16\n\t"				\
+		      "    jl 2f\n\t"					\
+		      "    vll %%v18,%[R_LI],32(%[R_IN])\n\t"		\
+		      "    ahi %[R_LI],-16\n\t"				\
+		      "    jl 2f\n\t"					\
+		      "    vll %%v19,%[R_LI],48(%[R_IN])\n\t"		\
+		      "2:\n\t"						\
+		      /* Shorten to byte values.  */			\
+		      "    vpkf %%v23,%%v16,%%v17\n\t"			\
+		      "    vpkf %%v24,%%v18,%%v19\n\t"			\
+		      "    vpkh %%v23,%%v23,%%v24\n\t"			\
+		      "    sllg %[R_LI],%[R_LEN],2\n\t"			\
+		      "    aghi %[R_LI],-16\n\t"			\
+		      "    jl 3f\n\t" /* v16 is not fully loaded.  */	\
+		      "    vstrcfs %%v22,%%v16,%%v20,%%v21\n\t"		\
+		      "    jno 10f\n\t"					\
+		      "    aghi %[R_LI],-16\n\t"			\
+		      "    jl 4f\n\t" /* v17 is not fully loaded.  */	\
+		      "    vstrcfs %%v22,%%v17,%%v20,%%v21\n\t"		\
+		      "    jno 11f\n\t"					\
+		      "    aghi %[R_LI],-16\n\t"			\
+		      "    jl 5f\n\t" /* v18 is not fully loaded.  */	\
+		      "    vstrcfs %%v22,%%v18,%%v20,%%v21\n\t"		\
+		      "    jno 12f\n\t"					\
+		      "    aghi %[R_LI],-16\n\t"			\
+		      /* v19 is not fully loaded. */			\
+		      "    lghi %[R_TMP],12\n\t"			\
+		      "    vstrcfs %%v22,%%v19,%%v20,%%v21\n\t"		\
+		      "6: vlgvb %[R_I],%%v22,7\n\t"			\
+		      "    aghi %[R_LI],16\n\t"				\
+		      "    clrjl %[R_I],%[R_LI],14f\n\t"		\
+		      "    lgr %[R_I],%[R_LEN]\n\t"			\
+		      "    lghi %[R_LEN],0\n\t"				\
+		      "    j 15f\n\t"					\
+		      "3: vstrcfs %%v22,%%v16,%%v20,%%v21\n\t"		\
+		      "    j 6b\n\t"					\
+		      "4: vstrcfs %%v22,%%v17,%%v20,%%v21\n\t"		\
+		      "    lghi %[R_TMP],4\n\t"				\
+		      "    j 6b\n\t"					\
+		      "5: vstrcfs %%v22,%%v17,%%v20,%%v21\n\t"		\
+		      "    lghi %[R_TMP],8\n\t"				\
+		      "    j 6b\n\t"					\
+		      /* Found a value > 0x7f.  */			\
+		      "13: ahi %[R_TMP],4\n\t"				\
+		      "12: ahi %[R_TMP],4\n\t"				\
+		      "11: ahi %[R_TMP],4\n\t"				\
+		      "10: vlgvb %[R_I],%%v22,7\n\t"			\
+		      "14: srlg %[R_I],%[R_I],2\n\t"			\
+		      "    agr %[R_I],%[R_TMP]\n\t"			\
+		      "    je 20f\n\t"					\
+		      /* Store characters before invalid one...  */	\
+		      "15: aghi %[R_I],-1\n\t"				\
+		      "    vstl %%v23,%[R_I],0(%[R_OUT])\n\t"		\
+		      /* ... and update pointers.  */			\
+		      "    la %[R_OUT],1(%[R_I],%[R_OUT])\n\t"		\
+		      "    sllg %[R_I],%[R_I],2\n\t"			\
+		      "    la %[R_IN],4(%[R_I],%[R_IN])\n\t"		\
+		      "20:\n\t"						\
+		      ".machine pop"					\
+		      : /* outputs */ [R_OUT] "+a" (outptr)		\
+			, [R_IN] "+a" (inptr)				\
+			, [R_LEN] "+d" (len)				\
+			, [R_LI] "=d" (loop_count)			\
+			, [R_I] "=a" (tmp2)				\
+			, [R_TMP] "=d" (tmp)				\
+		      : /* inputs */					\
+		      : /* clobber list*/ "memory", "cc"		\
+			ASM_CLOBBER_VR ("v16") ASM_CLOBBER_VR ("v17")	\
+			ASM_CLOBBER_VR ("v18") ASM_CLOBBER_VR ("v19")	\
+			ASM_CLOBBER_VR ("v20") ASM_CLOBBER_VR ("v21")	\
+			ASM_CLOBBER_VR ("v22") ASM_CLOBBER_VR ("v23")	\
+			ASM_CLOBBER_VR ("v24")				\
+		      );						\
+    if (len > 0)							\
+      {									\
+	/* Found an invalid character > 0x7f at next character.  */	\
+	BODY_ORIG_ERROR							\
+      }									\
+  }
+# define LOOP_NEED_FLAGS
+# include <iconv/loop.c>
+# include <iconv/skeleton.c>
+# undef BODY_ORIG
+# undef BODY_ORIG_ERROR
+ICONV_VX_IFUNC (__gconv_transform_internal_posix)
+
 
 /* Convert from internal UCS4 to UCS4 little endian form.  */
 # define DEFINE_INIT		0
diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
index 0f0f55f9ed..f87099bcf5 100644
--- a/wcsmbs/wcsmbsload.c
+++ b/wcsmbs/wcsmbsload.c
@@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
   .__shlib_handle = NULL,
   .__modname = NULL,
   .__counter = INT_MAX,
-  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
+  .__from_name = (char *) "POSIX",
   .__to_name = (char *) "INTERNAL",
-  .__fct = __gconv_transform_ascii_internal,
-  .__btowc_fct = __gconv_btwoc_ascii,
+  .__fct = __gconv_transform_posix_internal,
+  .__btowc_fct = __gconv_btwoc_posix,
   .__init_fct = NULL,
   .__end_fct = NULL,
   .__min_needed_from = 1,
@@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
   .__modname = NULL,
   .__counter = INT_MAX,
   .__from_name = (char *) "INTERNAL",
-  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
-  .__fct = __gconv_transform_internal_ascii,
+  .__to_name = (char *) "POSIX",
+  .__fct = __gconv_transform_internal_posix,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] POSIX locale covers every byte [BZ# 29511]
  2022-08-30 18:19 [PATCH] POSIX locale covers every byte [BZ# 29511] наб
  2022-09-06 14:06 ` [PATCH v2] " наб
@ 2022-09-06 14:19 ` Florian Weimer
  2022-09-06 18:06   ` наб
  1 sibling, 1 reply; 29+ messages in thread
From: Florian Weimer @ 2022-09-06 14:19 UTC (permalink / raw)
  To: наб via Libc-alpha; +Cc: наб

* наб via Libc-alpha:

> This is a trivial patch, largely duplicating the extant ASCII code
>
> There are two user-facing changes:
>   * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
>   * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b
>
> Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively:
>   (a) is 1-byte, stateless, and contains 256 characters
>   (b) which collate in byte order
>   (c) the first 128 characters are equivalent to ASCII (like previous)
> cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
> changes to the standard;
> in short, this means that mbrtowc() must never fail and must return
>   b if b <= 0x7F else ab+c for all bytes b
>   where c is some constant >=0x80
>     and a is a positive integer constant
>
> By strategically picking c=<UDF00> we land at the tail-end of the
> Unicode Low Surrogate Area at DC00-DFFF, described as
>   > Isolated surrogate code points have no interpretation;
>   > consequently, no character code charts or names lists
>   > are provided for this range.
> and match musl

We don't match Python and its surrogateescape encoding (PEP 838).  It
maps invalid bytes in the 0x80…0xff range to U+DC80…U+DCFF.  It may make
more sense to align with that.

What worries me is that this effectively closes the door for using UTF-8
(or some variant, such as Python's) with the C locale.  I used to assume
that POSIX allows that, but they now say this was just a mistake.

Anyway, regarding mechanics, we'll need a new localedata/charmaps/POSIX
charmap, I think.  This charmap then can be tested against the gconv
converter.

You should put the new converters into a separate file (not
iconv/gconv_simple.c), then the s390x version will use that
automatically.
> diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
> index 7ec7f1c577..fc34a6abc1 100644
> --- a/localedata/locales/POSIX
> +++ b/localedata/locales/POSIX
> @@ -97,6 +97,20 @@ END LC_CTYPE
>  LC_COLLATE
>  % This is the POSIX Locale definition for the LC_COLLATE category.

Isn't this just the C locale?  We don't have a separate file for that.

> diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
> index 0f0f55f9ed..f87099bcf5 100644
> --- a/wcsmbs/wcsmbsload.c
> +++ b/wcsmbs/wcsmbsload.c
> @@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
>    .__shlib_handle = NULL,
>    .__modname = NULL,
>    .__counter = INT_MAX,
> -  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
> +  .__from_name = (char *) "POSIX",
>    .__to_name = (char *) "INTERNAL",
> -  .__fct = __gconv_transform_ascii_internal,
> -  .__btowc_fct = __gconv_btwoc_ascii,
> +  .__fct = __gconv_transform_posix_internal,
> +  .__btowc_fct = __gconv_btwoc_posix,
>    .__init_fct = NULL,
>    .__end_fct = NULL,
>    .__min_needed_from = 1,
> @@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
>    .__modname = NULL,
>    .__counter = INT_MAX,
>    .__from_name = (char *) "INTERNAL",
> -  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
> -  .__fct = __gconv_transform_internal_ascii,
> +  .__to_name = (char *) "POSIX",
> +  .__fct = __gconv_transform_internal_posix,
>    .__btowc_fct = NULL,
>    .__init_fct = NULL,
>    .__end_fct = NULL,

This makes the comment on __wcsmbs_gconv_fcts_c in the same file
obsolete.

Thanks,
Florian


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH] POSIX locale covers every byte [BZ# 29511]
  2022-09-06 14:19 ` [PATCH] " Florian Weimer
@ 2022-09-06 18:06   ` наб
  2022-09-06 18:10     ` [PATCH v3 1/2] iconvdata/tst-table-charmap.sh: remove handling of old, borrowed format наб
  2022-09-06 18:11     ` [PATCH v3 " наб
  0 siblings, 2 replies; 29+ messages in thread
From: наб @ 2022-09-06 18:06 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha

[-- Attachment #1: Type: text/plain, Size: 5588 bytes --]

Hi!

On Tue, Sep 06, 2022 at 04:19:01PM +0200, Florian Weimer wrote:
> * наб via Libc-alpha:
> 
> > This is a trivial patch, largely duplicating the extant ASCII code
> >
> > There are two user-facing changes:
> >   * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
> >   * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b
> >
> > Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively:
> >   (a) is 1-byte, stateless, and contains 256 characters
> >   (b) which collate in byte order
> >   (c) the first 128 characters are equivalent to ASCII (like previous)
> > cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
> > changes to the standard;
> > in short, this means that mbrtowc() must never fail and must return
> >   b if b <= 0x7F else ab+c for all bytes b
> >   where c is some constant >=0x80
> >     and a is a positive integer constant
> >
> > By strategically picking c=<UDF00> we land at the tail-end of the
> > Unicode Low Surrogate Area at DC00-DFFF, described as
> >   > Isolated surrogate code points have no interpretation;
> >   > consequently, no character code charts or names lists
> >   > are provided for this range.
> > and match musl
> 
> We don't match Python and its surrogateescape encoding (PEP 838).
404?

> It
> maps invalid bytes in the 0x80…0xff range to U+DC80…U+DCFF.
(The same as musl.)

> It may make
> more sense to align with that.
With a=1 and c=<UDF00>, assuming it's as you say, we very much do?

$ printf '\x80\xff' | output/elf/ld.so --library-path output/ output/iconv/iconv_prog -fPOSIX -tUCS4 | hd
00000000  00 00 df 80 00 00 df ff                           |........|
00000008

> Anyway, regarding mechanics, we'll need a new localedata/charmaps/POSIX
> charmap, I think.  This charmap then can be tested against the gconv
> converter.
Hm, the problem with that is tst-tables -> tst-table -> tst-table-from
(and -to) convert by constructing a UTF-8 sequence. The problem with
this approach is that glibc rejects unpaired surrogates.

The output for tst-table-from UTF-8 is:
  ...
  0xED9FBE        0xD7FE
  0xED9FBF        0xD7FF
  0xEE8080        0xE000
  0xEE8081        0xE001
  ...
i.e. there's a gap for the surrogates; and, indeed, the charmap reads
  <UD7FB>     /xed/x9f/xbb HANGUL JONGSEONG PHIEUPH-THIEUTH
  %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
  %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
  %<UDB80>     /xed/xae/x80 <Private Use High Surrogate, First>
  %<UDBFF>     /xed/xaf/xbf <Private Use High Surrogate, Last>
  %<UDC00>     /xed/xb0/x80 <Low Surrogate, First>
  %<UDFFF>     /xed/xbf/xbf <Low Surrogate, Last>
  <UE000>..<UE03F> /xee/x80/x80 <Private Use>
with the surrogate range commented-out;
this dates back to the inclusion of UTF-8 generator scripts in 2015
(4a4839c94a4c93ffc0d5b95c69a08b02a57007f2), these exclusions are
deliberate (grep for surrog in localedata/unicode-gen/utf8_gen.py).

Given this limitation, expanding the charmap to
ANSI_X3.4-1968 + <UDF80>..<UDFFF> doesn't actually test much:
having them as separate codepoints will always fail tests,
and dot-notation lines are ignored when generating the comparison
tables, so this particular type of test just proves that POSIX is the
same as ANSI_X3.4-1968 for the first 128 characters.

There's already an exhaustive iconv_prog-based testsuite
(cf. additions to iconv/tst-iconv_prog.sh), though.

> You should put the new converters into a separate file (not
> iconv/gconv_simple.c), then the s390x version will use that
> automatically.
Oh, of course! Moved to iconv/gconv_posix.c.

> > diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
> > index 7ec7f1c577..fc34a6abc1 100644
> > --- a/localedata/locales/POSIX
> > +++ b/localedata/locales/POSIX
> > @@ -97,6 +97,20 @@ END LC_CTYPE
> >  LC_COLLATE
> >  % This is the POSIX Locale definition for the LC_COLLATE category.
> 
> Isn't this just the C locale?
Yes, C is defined to be POSIX.

> We don't have a separate file for that.
Yes, we very obviously do, seeing as this patch edits it?
Nothing consumes it AFAICT, but.

> > diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
> > index 0f0f55f9ed..f87099bcf5 100644
> > --- a/wcsmbs/wcsmbsload.c
> > +++ b/wcsmbs/wcsmbsload.c
> > @@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
> >    .__shlib_handle = NULL,
> >    .__modname = NULL,
> >    .__counter = INT_MAX,
> > -  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
> > +  .__from_name = (char *) "POSIX",
> >    .__to_name = (char *) "INTERNAL",
> > -  .__fct = __gconv_transform_ascii_internal,
> > -  .__btowc_fct = __gconv_btwoc_ascii,
> > +  .__fct = __gconv_transform_posix_internal,
> > +  .__btowc_fct = __gconv_btwoc_posix,
> >    .__init_fct = NULL,
> >    .__end_fct = NULL,
> >    .__min_needed_from = 1,
> > @@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
> >    .__modname = NULL,
> >    .__counter = INT_MAX,
> >    .__from_name = (char *) "INTERNAL",
> > -  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
> > -  .__fct = __gconv_transform_internal_ascii,
> > +  .__to_name = (char *) "POSIX",
> > +  .__fct = __gconv_transform_internal_posix,
> >    .__btowc_fct = NULL,
> >    .__init_fct = NULL,
> >    .__end_fct = NULL,
> 
> This makes the comment on __wcsmbs_gconv_fcts_c in the same file
> obsolete.

Comment fixed.

> Thanks,
> Florian

New patchset in followup.

Best,
наб

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v3 1/2] iconvdata/tst-table-charmap.sh: remove handling of old, borrowed format
  2022-09-06 18:06   ` наб
@ 2022-09-06 18:10     ` наб
  2022-09-14  2:39       ` [PATCH v4 " наб
  2022-09-14  2:39       ` [PATCH v4 " наб
  2022-09-06 18:11     ` [PATCH v3 " наб
  1 sibling, 2 replies; 29+ messages in thread
From: наб @ 2022-09-06 18:10 UTC (permalink / raw)
  To: libc-alpha; +Cc: Florian Weimer

[-- Attachment #1: Type: text/plain, Size: 3117 bytes --]

This "Old POSIX/DKUUG borrowed format" handling is original to the file
and doesn't seem to have ever been used, i.e. id/t-t-c doesn't seem to
have ever been called with argv[1] == POSIX

Upcoming is a POSIX charmap, which would inadvertently trigger this:
clear the way

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
 iconvdata/gb18030.c            |  2 +-
 iconvdata/tst-table-charmap.sh | 11 +----------
 iconvdata/tst-table.sh         |  2 +-
 3 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/iconvdata/gb18030.c b/iconvdata/gb18030.c
index ab9373cd12..bb93b55361 100644
--- a/iconvdata/gb18030.c
+++ b/iconvdata/gb18030.c
@@ -32,7 +32,7 @@
 /* The tables in this file are generated from the charmap.
    In a first step, the charmap is converted to a simple table format:
 
-      ./tst-table-charmap.sh GB18030 < ../localedata/charmaps/GB18030 \
+      ./tst-table-charmap.sh < ../localedata/charmaps/GB18030 \
       > GB18030.table
 */
 
diff --git a/iconvdata/tst-table-charmap.sh b/iconvdata/tst-table-charmap.sh
index 0e5369aa38..36959a02dd 100755
--- a/iconvdata/tst-table-charmap.sh
+++ b/iconvdata/tst-table-charmap.sh
@@ -22,13 +22,4 @@
 LC_ALL=C
 export LC_ALL
 
-case "$1" in
-  POSIX )
-    # Old POSIX/DKUUG borrowed format
-    grep '^<.*>.*/x[0-9A-Fa-f]*[ 	]*<U....>.*$' | grep -v 'not a real character' | sed -e 's,^<.*>[ 	]*\([/x0-9A-Fa-f]*\)[ 	]*<U\(....\)>.*$,\1	0x\2,' | tr abcdef ABCDEF | sed -e 's,/x\([0-9A-F][0-9A-F]\),\1,g' | sed -e 's,^,0x,' | sort | uniq | grep -v '^0x00	0x\([1-9A-F]...\|.[1-9A-F]..\|..[1-9A-F].\|...[1-9A-F]\)'
-    ;;
-  *)
-    # New Unicode based format
-    sed -e 's,^%IRREVERSIBLE%,,' | grep '^<U[0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*>[ 	]*/x' | grep -v 'not a real character' | sed -e 's,<U\(....\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' -e 's,<U0*\([1-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' | tr abcdef ABCDEF | sed -e 's,/x\([0-9A-F][0-9A-F]\),\1,g' | sed -e 's,^,0x,' | sort | uniq | grep -v '^0x00	0x\([1-9A-F]...\|.[1-9A-F]..\|..[1-9A-F].\|...[1-9A-F]\)'
-    ;;
-esac
+sed -e 's,^%IRREVERSIBLE%,,' | grep '^<U[0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*>[ 	]*/x' | grep -v 'not a real character' | sed -e 's,<U\(....\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' -e 's,<U0*\([1-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' | tr abcdef ABCDEF | sed -e 's,/x\([0-9A-F][0-9A-F]\),\1,g' | sed -e 's,^,0x,' | sort | uniq | grep -v '^0x00	0x\([1-9A-F]...\|.[1-9A-F]..\|..[1-9A-F].\|...[1-9A-F]\)'
diff --git a/iconvdata/tst-table.sh b/iconvdata/tst-table.sh
index f63ab1d8ee..36005c5448 100755
--- a/iconvdata/tst-table.sh
+++ b/iconvdata/tst-table.sh
@@ -33,7 +33,7 @@ export LC_ALL
 set -e
 
 # Get the charmap.
-./tst-table-charmap.sh ${charmap:-$charset} \
+./tst-table-charmap.sh \
   < ../localedata/charmaps/${charmap:-$charset} \
   > ${objpfx}tst-${charset}.charmap.table
 # When the charset is GB18030, truncate this table because for this encoding,
-- 
2.30.2


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v3 2/2] POSIX locale covers every byte [BZ# 29511]
  2022-09-06 18:06   ` наб
  2022-09-06 18:10     ` [PATCH v3 1/2] iconvdata/tst-table-charmap.sh: remove handling of old, borrowed format наб
@ 2022-09-06 18:11     ` наб
  1 sibling, 0 replies; 29+ messages in thread
From: наб @ 2022-09-06 18:11 UTC (permalink / raw)
  To: libc-alpha; +Cc: Florian Weimer

[-- Attachment #1: Type: text/plain, Size: 23746 bytes --]

This is a trivial patch, largely duplicating the extant ASCII code

There are two user-facing changes:
  * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
  * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b

Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
  (a) is 1-byte, stateless, and contains 256 characters
  (b) they collate in byte order
  (c) the first 128 characters are equivalent to ASCII (like previous)
cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
changes to the standard;
in short, this means that mbrtowc() must never fail and must return
  b if b <= 0x7F else ab+c for all bytes b
  where c is some constant >=0x80
    and a is a positive integer constant

By strategically picking c=<UDF00> we land at the tail-end of the
Unicode Low Surrogate Area at DC00-DFFF, described as
  > Isolated surrogate code points have no interpretation;
  > consequently, no character code charts or names lists
  > are provided for this range.
and match musl

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
new in v3: POSIX charset, __wcsmbs_gconv_fcts_c comment fixed

 iconv/Makefile                    |   2 +-
 iconv/gconv_builtin.h             |   8 ++
 iconv/gconv_int.h                 |   7 ++
 iconv/gconv_posix.c               |  96 ++++++++++++++++++++
 iconv/tst-iconv_prog.sh           |  43 +++++++++
 iconvdata/tst-tables.sh           |   1 +
 inet/tst-idna_name_classify.c     |   6 +-
 locale/tst-C-locale.c             |  69 ++++++++++++++
 localedata/charmaps/POSIX         | 136 ++++++++++++++++++++++++++++
 localedata/locales/POSIX          | 143 +++++++++++++++++++++++++++++-
 stdio-common/tst-printf-bz25691.c |   2 +
 wcsmbs/wcsmbsload.c               |  14 +--
 12 files changed, 516 insertions(+), 11 deletions(-)
 create mode 100644 iconv/gconv_posix.c
 create mode 100644 localedata/charmaps/POSIX

diff --git a/iconv/Makefile b/iconv/Makefile
index a0d90cfeac..6e926f53e3 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -25,7 +25,7 @@ include ../Makeconfig
 headers		= iconv.h gconv.h
 routines	= iconv_open iconv iconv_close \
 		  gconv_open gconv gconv_close gconv_db gconv_conf \
-		  gconv_builtin gconv_simple gconv_trans gconv_cache
+		  gconv_builtin gconv_simple gconv_posix gconv_trans gconv_cache
 routines	+= gconv_dl gconv_charset
 
 vpath %.c ../locale/programs ../intl
diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h
index 68c2369b1f..cd1805b3ce 100644
--- a/iconv/gconv_builtin.h
+++ b/iconv/gconv_builtin.h
@@ -89,6 +89,14 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
 			__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
 
 
+BUILTIN_TRANSFORMATION ("POSIX//", "INTERNAL", 1, "=posix->INTERNAL",
+			__gconv_transform_posix_internal, __gconv_btwoc_posix,
+			1, 1, 4, 4)
+
+BUILTIN_TRANSFORMATION ("INTERNAL", "POSIX//", 1, "=INTERNAL->posix",
+			__gconv_transform_internal_posix, NULL, 4, 4, 1, 1)
+
+
 #if BYTE_ORDER == BIG_ENDIAN
 BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
 BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index 1c6745043e..45ab1edfad 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -281,6 +281,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2)
 
 __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
+__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
 __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
 __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
@@ -299,6 +301,11 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
    only ASCII characters.  */
 extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
 
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c);
+
 #endif
 
 __END_DECLS
diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c
new file mode 100644
index 0000000000..dcb13fbb43
--- /dev/null
+++ b/iconv/gconv_posix.c
@@ -0,0 +1,96 @@
+/* Simple transformations functions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <gconv_int.h>
+
+
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+wint_t
+__gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+{
+  if (c < 0x80)
+    return c;
+  else
+    return 0xdf00 + c;
+}
+
+
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DF80, U+DFFF]}
+   to the internal (UCS4-like) format.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		1
+#define MIN_NEEDED_TO		4
+#define FROM_DIRECTION		1
+#define FROM_LOOP		posix_internal_loop
+#define TO_LOOP			posix_internal_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_posix_internal
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdf00 + *inptr++;			      \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;				      \
+    outptr += sizeof (uint32_t);					      \
+  }
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DF80, U+DFFF] => [0x80, 0xFF]}.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		4
+#define MIN_NEEDED_TO		1
+#define FROM_DIRECTION		1
+#define FROM_LOOP		internal_posix_loop
+#define TO_LOOP			internal_posix_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_internal_posix
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    uint32_t val = *((const uint32_t *) inptr);				      \
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdf80) || val > 0xdfff))      \
+      {									      \
+	UNICODE_TAG_HANDLER (val, 4);					      \
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
+      }									      \
+    else								      \
+      {									      \
+	if (__glibc_unlikely (val > 0x7f))				      \
+	  val -= 0xdf00;						      \
+	*outptr++ = val;						      \
+	inptr += sizeof (uint32_t);					      \
+      }									      \
+  }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
index b3d8bf5110..a24d8d2207 100644
--- a/iconv/tst-iconv_prog.sh
+++ b/iconv/tst-iconv_prog.sh
@@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do
   execute_test
   check_errtest_result
 done
+
+allbytes ()
+{
+  for (( i = 0; i <= 255; i++ )); do
+    printf '\'"$(printf "%o" "$i")"
+  done
+}
+
+allucs4be ()
+{
+  for (( i = 0; i <= 127; i++ )); do
+    printf '\0\0\0\'"$(printf "%o" "$i")"
+  done
+  for (( i = 128; i <= 255; i++ )); do
+    printf '\0\0\xdf\'"$(printf "%o" "$i")"
+  done
+}
+
+check_posix_result ()
+{
+  if [ $? -eq 0 ]; then
+    result=PASS
+  else
+    result=FAIL
+  fi
+
+  echo "$result: from \"$1\", to: \"$2\""
+
+  if [ "$result" != "PASS" ]; then
+    exit 1
+  fi
+}
+
+check_posix_encoding ()
+{
+  eval PROG=\"$ICONV\"
+  allbytes  | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be)
+  check_posix_result POSIX UCS-4BE
+  allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes)
+  check_posix_result UCS-4BE POSIX
+}
+
+check_posix_encoding
diff --git a/iconvdata/tst-tables.sh b/iconvdata/tst-tables.sh
index 4207b44175..33a02158ac 100755
--- a/iconvdata/tst-tables.sh
+++ b/iconvdata/tst-tables.sh
@@ -31,6 +31,7 @@ cat <<EOF |
   # Keep this list in the same order as gconv-modules.
   #
   # charset name    table name          comment
+  POSIX
   ASCII             ANSI_X3.4-1968
   ISO646-GB         BS_4730
   ISO646-CA         CSA_Z243.4-1985-1
diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c
index bfd34eee31..b379481844 100644
--- a/inet/tst-idna_name_classify.c
+++ b/inet/tst-idna_name_classify.c
@@ -37,11 +37,11 @@ do_test (void)
   puts ("info: C locale tests");
   locale_insensitive_tests ();
   TEST_COMPARE (__idna_name_classify ("abc\200def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
   TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
-                idna_name_encoding_error);
+                idna_name_nonascii_backslash);
   TEST_COMPARE (__idna_name_classify ("abc\377def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
 
   puts ("info: en_US.ISO-8859-1 locale tests");
   if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)
diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
index 6bd0367069..f30396ae12 100644
--- a/locale/tst-C-locale.c
+++ b/locale/tst-C-locale.c
@@ -229,6 +229,75 @@ run_test (const char *locname)
   STRTEST (YESSTR, "");
   STRTEST (NOSTR, "");
 
+#define CONVTEST(b, v) \
+  {									      \
+    unsigned char bs[] = {b, 0};					      \
+    mbstate_t ctx = {};							      \
+    wchar_t wc = -1;							      \
+    size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);			      \
+    if (sz != !!b)							      \
+      {									      \
+	printf ("mbrtowc(%02hhx) width in locale %s wrong "		      \
+		"(is %zd, should be %d)\n", *bs, locname, sz, !!b);	      \
+	result = 1;							      \
+      }									      \
+    if (wc != v)							      \
+      {									      \
+	printf ("mbrtowc(%02hhx) value in locale %s wrong "		      \
+		"(is %x, should be %x)\n", *bs, locname, wc, v);	      \
+	result = 1;							      \
+      }									      \
+  }
+  for(int i = 0; i <= 0x7f; ++i)
+    CONVTEST(i, i);
+  for(int i = 0x80; i <= 0xff; ++i)
+    CONVTEST(i, 0xdf00 + i);
+
+#define DECONVTEST(v, b) \
+  {									      \
+    unsigned char ob = -1;						      \
+    mbstate_t ctx = {};							      \
+    size_t sz = wcrtomb((char *) &ob, v, &ctx);				      \
+    if (sz != 1)							      \
+      {									      \
+	printf ("wcrtomb(%x) width in locale %s wrong "			      \
+		"(is %zd, should be 1)\n", v, locname, sz);		      \
+	result = 1;							      \
+      }									      \
+    if (ob != b)							      \
+      {									      \
+	printf ("wcrtomb(%x) value in locale %s wrong "			      \
+		"(is %hhx, should be %hhx)\n", v, locname, ob, b);	      \
+	result = 1;							      \
+      }									      \
+  }
+#define DECONVERR(v) \
+  {									      \
+    unsigned char ob = -1;						      \
+    mbstate_t ctx = {};							      \
+    size_t sz = wcrtomb((char *) &ob, v, &ctx);				      \
+    if (sz != (size_t) -1)						      \
+      {									      \
+	printf ("wcrtomb(%x) width in locale %s wrong "			      \
+		"(is %zd, should be (size_t )-1)\n", v, locname, sz);	      \
+	result = 1;							      \
+      }									      \
+    if (ob != (unsigned char) -1)					      \
+      {									      \
+	printf ("wcrtomb(%x) value in locale %s wrong "			      \
+		"(is %hhx, should be unchanged)\n", v, locname, ob);	      \
+	result = 1;							      \
+      }									      \
+  }
+  for(int i = 0; i <= 0x7f; ++i)
+    DECONVTEST(i, i);
+  for(int i = 0x80; i < 0xdf00; ++i)
+    DECONVERR(i);
+  for(int i = 0x80; i <= 0xff; ++i)
+    DECONVTEST(0xdf00 + i, i);
+  for(int i = 0xe000; i <= 0xffff; ++i)
+    DECONVERR(i);
+
   /* Test the new locale mechanisms.  */
   loc = newlocale (LC_ALL_MASK, locname, NULL);
   if (loc == NULL)
diff --git a/localedata/charmaps/POSIX b/localedata/charmaps/POSIX
new file mode 100644
index 0000000000..c44007ff49
--- /dev/null
+++ b/localedata/charmaps/POSIX
@@ -0,0 +1,136 @@
+<code_set_name> POSIX
+<comment_char> %
+<escape_char> /
+% source: cf. localedata/locales/POSIX, LC_COLLATE
+
+CHARMAP
+<U0000>     /x00         NULL (NUL)
+<U0001>     /x01         START OF HEADING (SOH)
+<U0002>     /x02         START OF TEXT (STX)
+<U0003>     /x03         END OF TEXT (ETX)
+<U0004>     /x04         END OF TRANSMISSION (EOT)
+<U0005>     /x05         ENQUIRY (ENQ)
+<U0006>     /x06         ACKNOWLEDGE (ACK)
+<U0007>     /x07         BELL (BEL)
+<U0008>     /x08         BACKSPACE (BS)
+<U0009>     /x09         CHARACTER TABULATION (HT)
+<U000A>     /x0a         LINE FEED (LF)
+<U000B>     /x0b         LINE TABULATION (VT)
+<U000C>     /x0c         FORM FEED (FF)
+<U000D>     /x0d         CARRIAGE RETURN (CR)
+<U000E>     /x0e         SHIFT OUT (SO)
+<U000F>     /x0f         SHIFT IN (SI)
+<U0010>     /x10         DATALINK ESCAPE (DLE)
+<U0011>     /x11         DEVICE CONTROL ONE (DC1)
+<U0012>     /x12         DEVICE CONTROL TWO (DC2)
+<U0013>     /x13         DEVICE CONTROL THREE (DC3)
+<U0014>     /x14         DEVICE CONTROL FOUR (DC4)
+<U0015>     /x15         NEGATIVE ACKNOWLEDGE (NAK)
+<U0016>     /x16         SYNCHRONOUS IDLE (SYN)
+<U0017>     /x17         END OF TRANSMISSION BLOCK (ETB)
+<U0018>     /x18         CANCEL (CAN)
+<U0019>     /x19         END OF MEDIUM (EM)
+<U001A>     /x1a         SUBSTITUTE (SUB)
+<U001B>     /x1b         ESCAPE (ESC)
+<U001C>     /x1c         FILE SEPARATOR (IS4)
+<U001D>     /x1d         GROUP SEPARATOR (IS3)
+<U001E>     /x1e         RECORD SEPARATOR (IS2)
+<U001F>     /x1f         UNIT SEPARATOR (IS1)
+<U0020>     /x20         SPACE
+<U0021>     /x21         EXCLAMATION MARK
+<U0022>     /x22         QUOTATION MARK
+<U0023>     /x23         NUMBER SIGN
+<U0024>     /x24         DOLLAR SIGN
+<U0025>     /x25         PERCENT SIGN
+<U0026>     /x26         AMPERSAND
+<U0027>     /x27         APOSTROPHE
+<U0028>     /x28         LEFT PARENTHESIS
+<U0029>     /x29         RIGHT PARENTHESIS
+<U002A>     /x2a         ASTERISK
+<U002B>     /x2b         PLUS SIGN
+<U002C>     /x2c         COMMA
+<U002D>     /x2d         HYPHEN-MINUS
+<U002E>     /x2e         FULL STOP
+<U002F>     /x2f         SOLIDUS
+<U0030>     /x30         DIGIT ZERO
+<U0031>     /x31         DIGIT ONE
+<U0032>     /x32         DIGIT TWO
+<U0033>     /x33         DIGIT THREE
+<U0034>     /x34         DIGIT FOUR
+<U0035>     /x35         DIGIT FIVE
+<U0036>     /x36         DIGIT SIX
+<U0037>     /x37         DIGIT SEVEN
+<U0038>     /x38         DIGIT EIGHT
+<U0039>     /x39         DIGIT NINE
+<U003A>     /x3a         COLON
+<U003B>     /x3b         SEMICOLON
+<U003C>     /x3c         LESS-THAN SIGN
+<U003D>     /x3d         EQUALS SIGN
+<U003E>     /x3e         GREATER-THAN SIGN
+<U003F>     /x3f         QUESTION MARK
+<U0040>     /x40         COMMERCIAL AT
+<U0041>     /x41         LATIN CAPITAL LETTER A
+<U0042>     /x42         LATIN CAPITAL LETTER B
+<U0043>     /x43         LATIN CAPITAL LETTER C
+<U0044>     /x44         LATIN CAPITAL LETTER D
+<U0045>     /x45         LATIN CAPITAL LETTER E
+<U0046>     /x46         LATIN CAPITAL LETTER F
+<U0047>     /x47         LATIN CAPITAL LETTER G
+<U0048>     /x48         LATIN CAPITAL LETTER H
+<U0049>     /x49         LATIN CAPITAL LETTER I
+<U004A>     /x4a         LATIN CAPITAL LETTER J
+<U004B>     /x4b         LATIN CAPITAL LETTER K
+<U004C>     /x4c         LATIN CAPITAL LETTER L
+<U004D>     /x4d         LATIN CAPITAL LETTER M
+<U004E>     /x4e         LATIN CAPITAL LETTER N
+<U004F>     /x4f         LATIN CAPITAL LETTER O
+<U0050>     /x50         LATIN CAPITAL LETTER P
+<U0051>     /x51         LATIN CAPITAL LETTER Q
+<U0052>     /x52         LATIN CAPITAL LETTER R
+<U0053>     /x53         LATIN CAPITAL LETTER S
+<U0054>     /x54         LATIN CAPITAL LETTER T
+<U0055>     /x55         LATIN CAPITAL LETTER U
+<U0056>     /x56         LATIN CAPITAL LETTER V
+<U0057>     /x57         LATIN CAPITAL LETTER W
+<U0058>     /x58         LATIN CAPITAL LETTER X
+<U0059>     /x59         LATIN CAPITAL LETTER Y
+<U005A>     /x5a         LATIN CAPITAL LETTER Z
+<U005B>     /x5b         LEFT SQUARE BRACKET
+<U005C>     /x5c         REVERSE SOLIDUS
+<U005D>     /x5d         RIGHT SQUARE BRACKET
+<U005E>     /x5e         CIRCUMFLEX ACCENT
+<U005F>     /x5f         LOW LINE
+<U0060>     /x60         GRAVE ACCENT
+<U0061>     /x61         LATIN SMALL LETTER A
+<U0062>     /x62         LATIN SMALL LETTER B
+<U0063>     /x63         LATIN SMALL LETTER C
+<U0064>     /x64         LATIN SMALL LETTER D
+<U0065>     /x65         LATIN SMALL LETTER E
+<U0066>     /x66         LATIN SMALL LETTER F
+<U0067>     /x67         LATIN SMALL LETTER G
+<U0068>     /x68         LATIN SMALL LETTER H
+<U0069>     /x69         LATIN SMALL LETTER I
+<U006A>     /x6a         LATIN SMALL LETTER J
+<U006B>     /x6b         LATIN SMALL LETTER K
+<U006C>     /x6c         LATIN SMALL LETTER L
+<U006D>     /x6d         LATIN SMALL LETTER M
+<U006E>     /x6e         LATIN SMALL LETTER N
+<U006F>     /x6f         LATIN SMALL LETTER O
+<U0070>     /x70         LATIN SMALL LETTER P
+<U0071>     /x71         LATIN SMALL LETTER Q
+<U0072>     /x72         LATIN SMALL LETTER R
+<U0073>     /x73         LATIN SMALL LETTER S
+<U0074>     /x74         LATIN SMALL LETTER T
+<U0075>     /x75         LATIN SMALL LETTER U
+<U0076>     /x76         LATIN SMALL LETTER V
+<U0077>     /x77         LATIN SMALL LETTER W
+<U0078>     /x78         LATIN SMALL LETTER X
+<U0079>     /x79         LATIN SMALL LETTER Y
+<U007A>     /x7a         LATIN SMALL LETTER Z
+<U007B>     /x7b         LEFT CURLY BRACKET
+<U007C>     /x7c         VERTICAL LINE
+<U007D>     /x7d         RIGHT CURLY BRACKET
+<U007E>     /x7e         TILDE
+<U007F>     /x7f         DELETE (DEL)
+<UDF80>..<UDFFF> /x80
+END CHARMAP
diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
index 7ec7f1c577..fc34a6abc1 100644
--- a/localedata/locales/POSIX
+++ b/localedata/locales/POSIX
@@ -97,6 +97,20 @@ END LC_CTYPE
 LC_COLLATE
 % This is the POSIX Locale definition for the LC_COLLATE category.
 % The order is the same as in the ASCII code set.
+% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2,
+% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale:
+% > All characters not explicitly listed here shall be inserted
+% > in the character collation order after the listed characters
+% > and shall be assigned unique primary weights. If the listed
+% > characters have ASCII encoding, the other characters shall
+% > be in ascending order according to their coded character set values
+% Since Issue 7 TC2 (XBD, 6.2 Character Encoding):
+% > The POSIX locale shall contain 256 single-byte characters [...]
+% (cf. bug 663, 674).
+% this is in contrast to previous issues, which limited the POSIX
+% locale to the Portable Character Set (7-bit ASCII).
+% We use the end of the Low Surrogate Area to contain these,
+% yielding [<UDF80>, <UDFFF>]
 order_start forward
 <U0000>
 <U0001>
@@ -226,7 +240,134 @@ order_start forward
 <U007D>
 <U007E>
 <U007F>
-UNDEFINED
+<UDF80>
+<UDF81>
+<UDF82>
+<UDF83>
+<UDF84>
+<UDF85>
+<UDF86>
+<UDF87>
+<UDF88>
+<UDF89>
+<UDF8A>
+<UDF8B>
+<UDF8C>
+<UDF8D>
+<UDF8E>
+<UDF8F>
+<UDF90>
+<UDF91>
+<UDF92>
+<UDF93>
+<UDF94>
+<UDF95>
+<UDF96>
+<UDF97>
+<UDF98>
+<UDF99>
+<UDF9A>
+<UDF9B>
+<UDF9C>
+<UDF9D>
+<UDF9E>
+<UDF9F>
+<UDFA0>
+<UDFA1>
+<UDFA2>
+<UDFA3>
+<UDFA4>
+<UDFA5>
+<UDFA6>
+<UDFA7>
+<UDFA8>
+<UDFA9>
+<UDFAA>
+<UDFAB>
+<UDFAC>
+<UDFAD>
+<UDFAE>
+<UDFAF>
+<UDFB0>
+<UDFB1>
+<UDFB2>
+<UDFB3>
+<UDFB4>
+<UDFB5>
+<UDFB6>
+<UDFB7>
+<UDFB8>
+<UDFB9>
+<UDFBA>
+<UDFBB>
+<UDFBC>
+<UDFBD>
+<UDFBE>
+<UDFBF>
+<UDFC0>
+<UDFC1>
+<UDFC2>
+<UDFC3>
+<UDFC4>
+<UDFC5>
+<UDFC6>
+<UDFC7>
+<UDFC8>
+<UDFC9>
+<UDFCA>
+<UDFCB>
+<UDFCC>
+<UDFCD>
+<UDFCE>
+<UDFCF>
+<UDFD0>
+<UDFD1>
+<UDFD2>
+<UDFD3>
+<UDFD4>
+<UDFD5>
+<UDFD6>
+<UDFD7>
+<UDFD8>
+<UDFD9>
+<UDFDA>
+<UDFDB>
+<UDFDC>
+<UDFDD>
+<UDFDE>
+<UDFDF>
+<UDFE0>
+<UDFE1>
+<UDFE2>
+<UDFE3>
+<UDFE4>
+<UDFE5>
+<UDFE6>
+<UDFE7>
+<UDFE8>
+<UDFE9>
+<UDFEA>
+<UDFEB>
+<UDFEC>
+<UDFED>
+<UDFEE>
+<UDFEF>
+<UDFF0>
+<UDFF1>
+<UDFF2>
+<UDFF3>
+<UDFF4>
+<UDFF5>
+<UDFF6>
+<UDFF7>
+<UDFF8>
+<UDFF9>
+<UDFFA>
+<UDFFB>
+<UDFFC>
+<UDFFD>
+<UDFFE>
+<UDFFF>
 order_end
 %
 END LC_COLLATE
diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
index 44844e71c3..e66242b58f 100644
--- a/stdio-common/tst-printf-bz25691.c
+++ b/stdio-common/tst-printf-bz25691.c
@@ -30,6 +30,8 @@
 static int
 do_test (void)
 {
+  setlocale(LC_CTYPE, "C.UTF-8");
+
   mtrace ();
 
   /* For 's' conversion specifier with 'l' modifier the array must be
diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
index 0f0f55f9ed..97de9afd25 100644
--- a/wcsmbs/wcsmbsload.c
+++ b/wcsmbs/wcsmbsload.c
@@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
   .__shlib_handle = NULL,
   .__modname = NULL,
   .__counter = INT_MAX,
-  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
+  .__from_name = (char *) "POSIX",
   .__to_name = (char *) "INTERNAL",
-  .__fct = __gconv_transform_ascii_internal,
-  .__btowc_fct = __gconv_btwoc_ascii,
+  .__fct = __gconv_transform_posix_internal,
+  .__btowc_fct = __gconv_btwoc_posix,
   .__init_fct = NULL,
   .__end_fct = NULL,
   .__min_needed_from = 1,
@@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
   .__modname = NULL,
   .__counter = INT_MAX,
   .__from_name = (char *) "INTERNAL",
-  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
-  .__fct = __gconv_transform_internal_ascii,
+  .__to_name = (char *) "POSIX",
+  .__fct = __gconv_transform_internal_posix,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
@@ -67,7 +67,9 @@ static const struct __gconv_step to_mb =
 };
 
 
-/* For the default locale we only have to handle ANSI_X3.4-1968.  */
+/* The default/"POSIX"/"C" locale is an 8-bit-clean mapping
+   with ANSI_X3.4-1968 in the first 128 characters;
+   we lift the remaining bytes by <UDF00>.  */
 const struct gconv_fcts __wcsmbs_gconv_fcts_c =
 {
   .towc = (struct __gconv_step *) &to_wc,
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v4 1/2] iconvdata/tst-table-charmap.sh: remove handling of old, borrowed format
  2022-09-06 18:10     ` [PATCH v3 1/2] iconvdata/tst-table-charmap.sh: remove handling of old, borrowed format наб
@ 2022-09-14  2:39       ` наб
  2022-09-21 14:01         ` [PATCH v5 " наб
  2022-09-21 14:01         ` [PATCH v5 " наб
  2022-09-14  2:39       ` [PATCH v4 " наб
  1 sibling, 2 replies; 29+ messages in thread
From: наб @ 2022-09-14  2:39 UTC (permalink / raw)
  To: libc-alpha; +Cc: Florian Weimer

[-- Attachment #1: Type: text/plain, Size: 3181 bytes --]

This "Old POSIX/DKUUG borrowed format" handling is original to the file
and doesn't seem to have ever been used, i.e. id/t-t-c doesn't seem to
have ever been called with argv[1] == POSIX

Upcoming is a POSIX charmap, which would inadvertently trigger this:
clear the way

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
new in v4: nothing

Resending after a week, per guideline.

 iconvdata/gb18030.c            |  2 +-
 iconvdata/tst-table-charmap.sh | 11 +----------
 iconvdata/tst-table.sh         |  2 +-
 3 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/iconvdata/gb18030.c b/iconvdata/gb18030.c
index ab9373cd12..bb93b55361 100644
--- a/iconvdata/gb18030.c
+++ b/iconvdata/gb18030.c
@@ -32,7 +32,7 @@
 /* The tables in this file are generated from the charmap.
    In a first step, the charmap is converted to a simple table format:
 
-      ./tst-table-charmap.sh GB18030 < ../localedata/charmaps/GB18030 \
+      ./tst-table-charmap.sh < ../localedata/charmaps/GB18030 \
       > GB18030.table
 */
 
diff --git a/iconvdata/tst-table-charmap.sh b/iconvdata/tst-table-charmap.sh
index 0e5369aa38..36959a02dd 100755
--- a/iconvdata/tst-table-charmap.sh
+++ b/iconvdata/tst-table-charmap.sh
@@ -22,13 +22,4 @@
 LC_ALL=C
 export LC_ALL
 
-case "$1" in
-  POSIX )
-    # Old POSIX/DKUUG borrowed format
-    grep '^<.*>.*/x[0-9A-Fa-f]*[ 	]*<U....>.*$' | grep -v 'not a real character' | sed -e 's,^<.*>[ 	]*\([/x0-9A-Fa-f]*\)[ 	]*<U\(....\)>.*$,\1	0x\2,' | tr abcdef ABCDEF | sed -e 's,/x\([0-9A-F][0-9A-F]\),\1,g' | sed -e 's,^,0x,' | sort | uniq | grep -v '^0x00	0x\([1-9A-F]...\|.[1-9A-F]..\|..[1-9A-F].\|...[1-9A-F]\)'
-    ;;
-  *)
-    # New Unicode based format
-    sed -e 's,^%IRREVERSIBLE%,,' | grep '^<U[0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*>[ 	]*/x' | grep -v 'not a real character' | sed -e 's,<U\(....\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' -e 's,<U0*\([1-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' | tr abcdef ABCDEF | sed -e 's,/x\([0-9A-F][0-9A-F]\),\1,g' | sed -e 's,^,0x,' | sort | uniq | grep -v '^0x00	0x\([1-9A-F]...\|.[1-9A-F]..\|..[1-9A-F].\|...[1-9A-F]\)'
-    ;;
-esac
+sed -e 's,^%IRREVERSIBLE%,,' | grep '^<U[0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*>[ 	]*/x' | grep -v 'not a real character' | sed -e 's,<U\(....\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' -e 's,<U0*\([1-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' | tr abcdef ABCDEF | sed -e 's,/x\([0-9A-F][0-9A-F]\),\1,g' | sed -e 's,^,0x,' | sort | uniq | grep -v '^0x00	0x\([1-9A-F]...\|.[1-9A-F]..\|..[1-9A-F].\|...[1-9A-F]\)'
diff --git a/iconvdata/tst-table.sh b/iconvdata/tst-table.sh
index f63ab1d8ee..36005c5448 100755
--- a/iconvdata/tst-table.sh
+++ b/iconvdata/tst-table.sh
@@ -33,7 +33,7 @@ export LC_ALL
 set -e
 
 # Get the charmap.
-./tst-table-charmap.sh ${charmap:-$charset} \
+./tst-table-charmap.sh \
   < ../localedata/charmaps/${charmap:-$charset} \
   > ${objpfx}tst-${charset}.charmap.table
 # When the charset is GB18030, truncate this table because for this encoding,
-- 
2.30.2


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v4 2/2] POSIX locale covers every byte [BZ# 29511]
  2022-09-06 18:10     ` [PATCH v3 1/2] iconvdata/tst-table-charmap.sh: remove handling of old, borrowed format наб
  2022-09-14  2:39       ` [PATCH v4 " наб
@ 2022-09-14  2:39       ` наб
  1 sibling, 0 replies; 29+ messages in thread
From: наб @ 2022-09-14  2:39 UTC (permalink / raw)
  To: libc-alpha; +Cc: Florian Weimer

[-- Attachment #1: Type: text/plain, Size: 23828 bytes --]

This is a trivial patch, largely duplicating the extant ASCII code

There are two user-facing changes:
  * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
  * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b

Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
  (a) is 1-byte, stateless, and contains 256 characters
  (b) they collate in byte order
  (c) the first 128 characters are equivalent to ASCII (like previous)
cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
changes to the standard;
in short, this means that mbrtowc() must never fail and must return
  b if b <= 0x7F else ab+c for all bytes b
  where c is some constant >=0x80
    and a is a positive integer constant

By strategically picking c=<UDF00> we land at the tail-end of the
Unicode Low Surrogate Area at DC00-DFFF, described as
  > Isolated surrogate code points have no interpretation;
  > consequently, no character code charts or names lists
  > are provided for this range.
and match musl

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
new in v2: nothing
new in v3: POSIX charset, __wcsmbs_gconv_fcts_c comment fixed
new in v4: nothing

Resending after a week, per guideline.

 iconv/Makefile                    |   2 +-
 iconv/gconv_builtin.h             |   8 ++
 iconv/gconv_int.h                 |   7 ++
 iconv/gconv_posix.c               |  96 ++++++++++++++++++++
 iconv/tst-iconv_prog.sh           |  43 +++++++++
 iconvdata/tst-tables.sh           |   1 +
 inet/tst-idna_name_classify.c     |   6 +-
 locale/tst-C-locale.c             |  69 ++++++++++++++
 localedata/charmaps/POSIX         | 136 ++++++++++++++++++++++++++++
 localedata/locales/POSIX          | 143 +++++++++++++++++++++++++++++-
 stdio-common/tst-printf-bz25691.c |   2 +
 wcsmbs/wcsmbsload.c               |  14 +--
 12 files changed, 516 insertions(+), 11 deletions(-)
 create mode 100644 iconv/gconv_posix.c
 create mode 100644 localedata/charmaps/POSIX

diff --git a/iconv/Makefile b/iconv/Makefile
index a0d90cfeac..6e926f53e3 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -25,7 +25,7 @@ include ../Makeconfig
 headers		= iconv.h gconv.h
 routines	= iconv_open iconv iconv_close \
 		  gconv_open gconv gconv_close gconv_db gconv_conf \
-		  gconv_builtin gconv_simple gconv_trans gconv_cache
+		  gconv_builtin gconv_simple gconv_posix gconv_trans gconv_cache
 routines	+= gconv_dl gconv_charset
 
 vpath %.c ../locale/programs ../intl
diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h
index 68c2369b1f..cd1805b3ce 100644
--- a/iconv/gconv_builtin.h
+++ b/iconv/gconv_builtin.h
@@ -89,6 +89,14 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
 			__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
 
 
+BUILTIN_TRANSFORMATION ("POSIX//", "INTERNAL", 1, "=posix->INTERNAL",
+			__gconv_transform_posix_internal, __gconv_btwoc_posix,
+			1, 1, 4, 4)
+
+BUILTIN_TRANSFORMATION ("INTERNAL", "POSIX//", 1, "=INTERNAL->posix",
+			__gconv_transform_internal_posix, NULL, 4, 4, 1, 1)
+
+
 #if BYTE_ORDER == BIG_ENDIAN
 BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
 BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index 1c6745043e..45ab1edfad 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -281,6 +281,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2)
 
 __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
+__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
 __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
 __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
@@ -299,6 +301,11 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
    only ASCII characters.  */
 extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
 
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c);
+
 #endif
 
 __END_DECLS
diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c
new file mode 100644
index 0000000000..dcb13fbb43
--- /dev/null
+++ b/iconv/gconv_posix.c
@@ -0,0 +1,96 @@
+/* Simple transformations functions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <gconv_int.h>
+
+
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+wint_t
+__gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+{
+  if (c < 0x80)
+    return c;
+  else
+    return 0xdf00 + c;
+}
+
+
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DF80, U+DFFF]}
+   to the internal (UCS4-like) format.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		1
+#define MIN_NEEDED_TO		4
+#define FROM_DIRECTION		1
+#define FROM_LOOP		posix_internal_loop
+#define TO_LOOP			posix_internal_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_posix_internal
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdf00 + *inptr++;			      \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;				      \
+    outptr += sizeof (uint32_t);					      \
+  }
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DF80, U+DFFF] => [0x80, 0xFF]}.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		4
+#define MIN_NEEDED_TO		1
+#define FROM_DIRECTION		1
+#define FROM_LOOP		internal_posix_loop
+#define TO_LOOP			internal_posix_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_internal_posix
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    uint32_t val = *((const uint32_t *) inptr);				      \
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdf80) || val > 0xdfff))      \
+      {									      \
+	UNICODE_TAG_HANDLER (val, 4);					      \
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
+      }									      \
+    else								      \
+      {									      \
+	if (__glibc_unlikely (val > 0x7f))				      \
+	  val -= 0xdf00;						      \
+	*outptr++ = val;						      \
+	inptr += sizeof (uint32_t);					      \
+      }									      \
+  }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
index b3d8bf5110..a24d8d2207 100644
--- a/iconv/tst-iconv_prog.sh
+++ b/iconv/tst-iconv_prog.sh
@@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do
   execute_test
   check_errtest_result
 done
+
+allbytes ()
+{
+  for (( i = 0; i <= 255; i++ )); do
+    printf '\'"$(printf "%o" "$i")"
+  done
+}
+
+allucs4be ()
+{
+  for (( i = 0; i <= 127; i++ )); do
+    printf '\0\0\0\'"$(printf "%o" "$i")"
+  done
+  for (( i = 128; i <= 255; i++ )); do
+    printf '\0\0\xdf\'"$(printf "%o" "$i")"
+  done
+}
+
+check_posix_result ()
+{
+  if [ $? -eq 0 ]; then
+    result=PASS
+  else
+    result=FAIL
+  fi
+
+  echo "$result: from \"$1\", to: \"$2\""
+
+  if [ "$result" != "PASS" ]; then
+    exit 1
+  fi
+}
+
+check_posix_encoding ()
+{
+  eval PROG=\"$ICONV\"
+  allbytes  | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be)
+  check_posix_result POSIX UCS-4BE
+  allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes)
+  check_posix_result UCS-4BE POSIX
+}
+
+check_posix_encoding
diff --git a/iconvdata/tst-tables.sh b/iconvdata/tst-tables.sh
index 4207b44175..33a02158ac 100755
--- a/iconvdata/tst-tables.sh
+++ b/iconvdata/tst-tables.sh
@@ -31,6 +31,7 @@ cat <<EOF |
   # Keep this list in the same order as gconv-modules.
   #
   # charset name    table name          comment
+  POSIX
   ASCII             ANSI_X3.4-1968
   ISO646-GB         BS_4730
   ISO646-CA         CSA_Z243.4-1985-1
diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c
index bfd34eee31..b379481844 100644
--- a/inet/tst-idna_name_classify.c
+++ b/inet/tst-idna_name_classify.c
@@ -37,11 +37,11 @@ do_test (void)
   puts ("info: C locale tests");
   locale_insensitive_tests ();
   TEST_COMPARE (__idna_name_classify ("abc\200def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
   TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
-                idna_name_encoding_error);
+                idna_name_nonascii_backslash);
   TEST_COMPARE (__idna_name_classify ("abc\377def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
 
   puts ("info: en_US.ISO-8859-1 locale tests");
   if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)
diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
index 6bd0367069..f30396ae12 100644
--- a/locale/tst-C-locale.c
+++ b/locale/tst-C-locale.c
@@ -229,6 +229,75 @@ run_test (const char *locname)
   STRTEST (YESSTR, "");
   STRTEST (NOSTR, "");
 
+#define CONVTEST(b, v) \
+  {									      \
+    unsigned char bs[] = {b, 0};					      \
+    mbstate_t ctx = {};							      \
+    wchar_t wc = -1;							      \
+    size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);			      \
+    if (sz != !!b)							      \
+      {									      \
+	printf ("mbrtowc(%02hhx) width in locale %s wrong "		      \
+		"(is %zd, should be %d)\n", *bs, locname, sz, !!b);	      \
+	result = 1;							      \
+      }									      \
+    if (wc != v)							      \
+      {									      \
+	printf ("mbrtowc(%02hhx) value in locale %s wrong "		      \
+		"(is %x, should be %x)\n", *bs, locname, wc, v);	      \
+	result = 1;							      \
+      }									      \
+  }
+  for(int i = 0; i <= 0x7f; ++i)
+    CONVTEST(i, i);
+  for(int i = 0x80; i <= 0xff; ++i)
+    CONVTEST(i, 0xdf00 + i);
+
+#define DECONVTEST(v, b) \
+  {									      \
+    unsigned char ob = -1;						      \
+    mbstate_t ctx = {};							      \
+    size_t sz = wcrtomb((char *) &ob, v, &ctx);				      \
+    if (sz != 1)							      \
+      {									      \
+	printf ("wcrtomb(%x) width in locale %s wrong "			      \
+		"(is %zd, should be 1)\n", v, locname, sz);		      \
+	result = 1;							      \
+      }									      \
+    if (ob != b)							      \
+      {									      \
+	printf ("wcrtomb(%x) value in locale %s wrong "			      \
+		"(is %hhx, should be %hhx)\n", v, locname, ob, b);	      \
+	result = 1;							      \
+      }									      \
+  }
+#define DECONVERR(v) \
+  {									      \
+    unsigned char ob = -1;						      \
+    mbstate_t ctx = {};							      \
+    size_t sz = wcrtomb((char *) &ob, v, &ctx);				      \
+    if (sz != (size_t) -1)						      \
+      {									      \
+	printf ("wcrtomb(%x) width in locale %s wrong "			      \
+		"(is %zd, should be (size_t )-1)\n", v, locname, sz);	      \
+	result = 1;							      \
+      }									      \
+    if (ob != (unsigned char) -1)					      \
+      {									      \
+	printf ("wcrtomb(%x) value in locale %s wrong "			      \
+		"(is %hhx, should be unchanged)\n", v, locname, ob);	      \
+	result = 1;							      \
+      }									      \
+  }
+  for(int i = 0; i <= 0x7f; ++i)
+    DECONVTEST(i, i);
+  for(int i = 0x80; i < 0xdf00; ++i)
+    DECONVERR(i);
+  for(int i = 0x80; i <= 0xff; ++i)
+    DECONVTEST(0xdf00 + i, i);
+  for(int i = 0xe000; i <= 0xffff; ++i)
+    DECONVERR(i);
+
   /* Test the new locale mechanisms.  */
   loc = newlocale (LC_ALL_MASK, locname, NULL);
   if (loc == NULL)
diff --git a/localedata/charmaps/POSIX b/localedata/charmaps/POSIX
new file mode 100644
index 0000000000..c44007ff49
--- /dev/null
+++ b/localedata/charmaps/POSIX
@@ -0,0 +1,136 @@
+<code_set_name> POSIX
+<comment_char> %
+<escape_char> /
+% source: cf. localedata/locales/POSIX, LC_COLLATE
+
+CHARMAP
+<U0000>     /x00         NULL (NUL)
+<U0001>     /x01         START OF HEADING (SOH)
+<U0002>     /x02         START OF TEXT (STX)
+<U0003>     /x03         END OF TEXT (ETX)
+<U0004>     /x04         END OF TRANSMISSION (EOT)
+<U0005>     /x05         ENQUIRY (ENQ)
+<U0006>     /x06         ACKNOWLEDGE (ACK)
+<U0007>     /x07         BELL (BEL)
+<U0008>     /x08         BACKSPACE (BS)
+<U0009>     /x09         CHARACTER TABULATION (HT)
+<U000A>     /x0a         LINE FEED (LF)
+<U000B>     /x0b         LINE TABULATION (VT)
+<U000C>     /x0c         FORM FEED (FF)
+<U000D>     /x0d         CARRIAGE RETURN (CR)
+<U000E>     /x0e         SHIFT OUT (SO)
+<U000F>     /x0f         SHIFT IN (SI)
+<U0010>     /x10         DATALINK ESCAPE (DLE)
+<U0011>     /x11         DEVICE CONTROL ONE (DC1)
+<U0012>     /x12         DEVICE CONTROL TWO (DC2)
+<U0013>     /x13         DEVICE CONTROL THREE (DC3)
+<U0014>     /x14         DEVICE CONTROL FOUR (DC4)
+<U0015>     /x15         NEGATIVE ACKNOWLEDGE (NAK)
+<U0016>     /x16         SYNCHRONOUS IDLE (SYN)
+<U0017>     /x17         END OF TRANSMISSION BLOCK (ETB)
+<U0018>     /x18         CANCEL (CAN)
+<U0019>     /x19         END OF MEDIUM (EM)
+<U001A>     /x1a         SUBSTITUTE (SUB)
+<U001B>     /x1b         ESCAPE (ESC)
+<U001C>     /x1c         FILE SEPARATOR (IS4)
+<U001D>     /x1d         GROUP SEPARATOR (IS3)
+<U001E>     /x1e         RECORD SEPARATOR (IS2)
+<U001F>     /x1f         UNIT SEPARATOR (IS1)
+<U0020>     /x20         SPACE
+<U0021>     /x21         EXCLAMATION MARK
+<U0022>     /x22         QUOTATION MARK
+<U0023>     /x23         NUMBER SIGN
+<U0024>     /x24         DOLLAR SIGN
+<U0025>     /x25         PERCENT SIGN
+<U0026>     /x26         AMPERSAND
+<U0027>     /x27         APOSTROPHE
+<U0028>     /x28         LEFT PARENTHESIS
+<U0029>     /x29         RIGHT PARENTHESIS
+<U002A>     /x2a         ASTERISK
+<U002B>     /x2b         PLUS SIGN
+<U002C>     /x2c         COMMA
+<U002D>     /x2d         HYPHEN-MINUS
+<U002E>     /x2e         FULL STOP
+<U002F>     /x2f         SOLIDUS
+<U0030>     /x30         DIGIT ZERO
+<U0031>     /x31         DIGIT ONE
+<U0032>     /x32         DIGIT TWO
+<U0033>     /x33         DIGIT THREE
+<U0034>     /x34         DIGIT FOUR
+<U0035>     /x35         DIGIT FIVE
+<U0036>     /x36         DIGIT SIX
+<U0037>     /x37         DIGIT SEVEN
+<U0038>     /x38         DIGIT EIGHT
+<U0039>     /x39         DIGIT NINE
+<U003A>     /x3a         COLON
+<U003B>     /x3b         SEMICOLON
+<U003C>     /x3c         LESS-THAN SIGN
+<U003D>     /x3d         EQUALS SIGN
+<U003E>     /x3e         GREATER-THAN SIGN
+<U003F>     /x3f         QUESTION MARK
+<U0040>     /x40         COMMERCIAL AT
+<U0041>     /x41         LATIN CAPITAL LETTER A
+<U0042>     /x42         LATIN CAPITAL LETTER B
+<U0043>     /x43         LATIN CAPITAL LETTER C
+<U0044>     /x44         LATIN CAPITAL LETTER D
+<U0045>     /x45         LATIN CAPITAL LETTER E
+<U0046>     /x46         LATIN CAPITAL LETTER F
+<U0047>     /x47         LATIN CAPITAL LETTER G
+<U0048>     /x48         LATIN CAPITAL LETTER H
+<U0049>     /x49         LATIN CAPITAL LETTER I
+<U004A>     /x4a         LATIN CAPITAL LETTER J
+<U004B>     /x4b         LATIN CAPITAL LETTER K
+<U004C>     /x4c         LATIN CAPITAL LETTER L
+<U004D>     /x4d         LATIN CAPITAL LETTER M
+<U004E>     /x4e         LATIN CAPITAL LETTER N
+<U004F>     /x4f         LATIN CAPITAL LETTER O
+<U0050>     /x50         LATIN CAPITAL LETTER P
+<U0051>     /x51         LATIN CAPITAL LETTER Q
+<U0052>     /x52         LATIN CAPITAL LETTER R
+<U0053>     /x53         LATIN CAPITAL LETTER S
+<U0054>     /x54         LATIN CAPITAL LETTER T
+<U0055>     /x55         LATIN CAPITAL LETTER U
+<U0056>     /x56         LATIN CAPITAL LETTER V
+<U0057>     /x57         LATIN CAPITAL LETTER W
+<U0058>     /x58         LATIN CAPITAL LETTER X
+<U0059>     /x59         LATIN CAPITAL LETTER Y
+<U005A>     /x5a         LATIN CAPITAL LETTER Z
+<U005B>     /x5b         LEFT SQUARE BRACKET
+<U005C>     /x5c         REVERSE SOLIDUS
+<U005D>     /x5d         RIGHT SQUARE BRACKET
+<U005E>     /x5e         CIRCUMFLEX ACCENT
+<U005F>     /x5f         LOW LINE
+<U0060>     /x60         GRAVE ACCENT
+<U0061>     /x61         LATIN SMALL LETTER A
+<U0062>     /x62         LATIN SMALL LETTER B
+<U0063>     /x63         LATIN SMALL LETTER C
+<U0064>     /x64         LATIN SMALL LETTER D
+<U0065>     /x65         LATIN SMALL LETTER E
+<U0066>     /x66         LATIN SMALL LETTER F
+<U0067>     /x67         LATIN SMALL LETTER G
+<U0068>     /x68         LATIN SMALL LETTER H
+<U0069>     /x69         LATIN SMALL LETTER I
+<U006A>     /x6a         LATIN SMALL LETTER J
+<U006B>     /x6b         LATIN SMALL LETTER K
+<U006C>     /x6c         LATIN SMALL LETTER L
+<U006D>     /x6d         LATIN SMALL LETTER M
+<U006E>     /x6e         LATIN SMALL LETTER N
+<U006F>     /x6f         LATIN SMALL LETTER O
+<U0070>     /x70         LATIN SMALL LETTER P
+<U0071>     /x71         LATIN SMALL LETTER Q
+<U0072>     /x72         LATIN SMALL LETTER R
+<U0073>     /x73         LATIN SMALL LETTER S
+<U0074>     /x74         LATIN SMALL LETTER T
+<U0075>     /x75         LATIN SMALL LETTER U
+<U0076>     /x76         LATIN SMALL LETTER V
+<U0077>     /x77         LATIN SMALL LETTER W
+<U0078>     /x78         LATIN SMALL LETTER X
+<U0079>     /x79         LATIN SMALL LETTER Y
+<U007A>     /x7a         LATIN SMALL LETTER Z
+<U007B>     /x7b         LEFT CURLY BRACKET
+<U007C>     /x7c         VERTICAL LINE
+<U007D>     /x7d         RIGHT CURLY BRACKET
+<U007E>     /x7e         TILDE
+<U007F>     /x7f         DELETE (DEL)
+<UDF80>..<UDFFF> /x80
+END CHARMAP
diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
index 7ec7f1c577..fc34a6abc1 100644
--- a/localedata/locales/POSIX
+++ b/localedata/locales/POSIX
@@ -97,6 +97,20 @@ END LC_CTYPE
 LC_COLLATE
 % This is the POSIX Locale definition for the LC_COLLATE category.
 % The order is the same as in the ASCII code set.
+% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2,
+% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale:
+% > All characters not explicitly listed here shall be inserted
+% > in the character collation order after the listed characters
+% > and shall be assigned unique primary weights. If the listed
+% > characters have ASCII encoding, the other characters shall
+% > be in ascending order according to their coded character set values
+% Since Issue 7 TC2 (XBD, 6.2 Character Encoding):
+% > The POSIX locale shall contain 256 single-byte characters [...]
+% (cf. bug 663, 674).
+% this is in contrast to previous issues, which limited the POSIX
+% locale to the Portable Character Set (7-bit ASCII).
+% We use the end of the Low Surrogate Area to contain these,
+% yielding [<UDF80>, <UDFFF>]
 order_start forward
 <U0000>
 <U0001>
@@ -226,7 +240,134 @@ order_start forward
 <U007D>
 <U007E>
 <U007F>
-UNDEFINED
+<UDF80>
+<UDF81>
+<UDF82>
+<UDF83>
+<UDF84>
+<UDF85>
+<UDF86>
+<UDF87>
+<UDF88>
+<UDF89>
+<UDF8A>
+<UDF8B>
+<UDF8C>
+<UDF8D>
+<UDF8E>
+<UDF8F>
+<UDF90>
+<UDF91>
+<UDF92>
+<UDF93>
+<UDF94>
+<UDF95>
+<UDF96>
+<UDF97>
+<UDF98>
+<UDF99>
+<UDF9A>
+<UDF9B>
+<UDF9C>
+<UDF9D>
+<UDF9E>
+<UDF9F>
+<UDFA0>
+<UDFA1>
+<UDFA2>
+<UDFA3>
+<UDFA4>
+<UDFA5>
+<UDFA6>
+<UDFA7>
+<UDFA8>
+<UDFA9>
+<UDFAA>
+<UDFAB>
+<UDFAC>
+<UDFAD>
+<UDFAE>
+<UDFAF>
+<UDFB0>
+<UDFB1>
+<UDFB2>
+<UDFB3>
+<UDFB4>
+<UDFB5>
+<UDFB6>
+<UDFB7>
+<UDFB8>
+<UDFB9>
+<UDFBA>
+<UDFBB>
+<UDFBC>
+<UDFBD>
+<UDFBE>
+<UDFBF>
+<UDFC0>
+<UDFC1>
+<UDFC2>
+<UDFC3>
+<UDFC4>
+<UDFC5>
+<UDFC6>
+<UDFC7>
+<UDFC8>
+<UDFC9>
+<UDFCA>
+<UDFCB>
+<UDFCC>
+<UDFCD>
+<UDFCE>
+<UDFCF>
+<UDFD0>
+<UDFD1>
+<UDFD2>
+<UDFD3>
+<UDFD4>
+<UDFD5>
+<UDFD6>
+<UDFD7>
+<UDFD8>
+<UDFD9>
+<UDFDA>
+<UDFDB>
+<UDFDC>
+<UDFDD>
+<UDFDE>
+<UDFDF>
+<UDFE0>
+<UDFE1>
+<UDFE2>
+<UDFE3>
+<UDFE4>
+<UDFE5>
+<UDFE6>
+<UDFE7>
+<UDFE8>
+<UDFE9>
+<UDFEA>
+<UDFEB>
+<UDFEC>
+<UDFED>
+<UDFEE>
+<UDFEF>
+<UDFF0>
+<UDFF1>
+<UDFF2>
+<UDFF3>
+<UDFF4>
+<UDFF5>
+<UDFF6>
+<UDFF7>
+<UDFF8>
+<UDFF9>
+<UDFFA>
+<UDFFB>
+<UDFFC>
+<UDFFD>
+<UDFFE>
+<UDFFF>
 order_end
 %
 END LC_COLLATE
diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
index 44844e71c3..e66242b58f 100644
--- a/stdio-common/tst-printf-bz25691.c
+++ b/stdio-common/tst-printf-bz25691.c
@@ -30,6 +30,8 @@
 static int
 do_test (void)
 {
+  setlocale(LC_CTYPE, "C.UTF-8");
+
   mtrace ();
 
   /* For 's' conversion specifier with 'l' modifier the array must be
diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
index 0f0f55f9ed..97de9afd25 100644
--- a/wcsmbs/wcsmbsload.c
+++ b/wcsmbs/wcsmbsload.c
@@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
   .__shlib_handle = NULL,
   .__modname = NULL,
   .__counter = INT_MAX,
-  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
+  .__from_name = (char *) "POSIX",
   .__to_name = (char *) "INTERNAL",
-  .__fct = __gconv_transform_ascii_internal,
-  .__btowc_fct = __gconv_btwoc_ascii,
+  .__fct = __gconv_transform_posix_internal,
+  .__btowc_fct = __gconv_btwoc_posix,
   .__init_fct = NULL,
   .__end_fct = NULL,
   .__min_needed_from = 1,
@@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
   .__modname = NULL,
   .__counter = INT_MAX,
   .__from_name = (char *) "INTERNAL",
-  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
-  .__fct = __gconv_transform_internal_ascii,
+  .__to_name = (char *) "POSIX",
+  .__fct = __gconv_transform_internal_posix,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
@@ -67,7 +67,9 @@ static const struct __gconv_step to_mb =
 };
 
 
-/* For the default locale we only have to handle ANSI_X3.4-1968.  */
+/* The default/"POSIX"/"C" locale is an 8-bit-clean mapping
+   with ANSI_X3.4-1968 in the first 128 characters;
+   we lift the remaining bytes by <UDF00>.  */
 const struct gconv_fcts __wcsmbs_gconv_fcts_c =
 {
   .towc = (struct __gconv_step *) &to_wc,
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v5 1/2] iconvdata/tst-table-charmap.sh: remove handling of old, borrowed format
  2022-09-14  2:39       ` [PATCH v4 " наб
@ 2022-09-21 14:01         ` наб
  2022-11-02 17:17           ` [PATCH v6 " наб
  2022-11-02 17:17           ` [PATCH v6 2/2] POSIX locale covers every byte [BZ# 29511] наб
  2022-09-21 14:01         ` [PATCH v5 " наб
  1 sibling, 2 replies; 29+ messages in thread
From: наб @ 2022-09-21 14:01 UTC (permalink / raw)
  To: libc-alpha; +Cc: Florian Weimer

[-- Attachment #1: Type: text/plain, Size: 3201 bytes --]

This "Old POSIX/DKUUG borrowed format" handling is original to the file
and doesn't seem to have ever been used, i.e. id/t-t-c doesn't seem to
have ever been called with argv[1] == POSIX

Upcoming is a POSIX charmap, which would inadvertently trigger this:
clear the way

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
new in v4: nothing
new in v5: nothing

Resending after a week, per guideline.

 iconvdata/gb18030.c            |  2 +-
 iconvdata/tst-table-charmap.sh | 11 +----------
 iconvdata/tst-table.sh         |  2 +-
 3 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/iconvdata/gb18030.c b/iconvdata/gb18030.c
index ab9373cd12..bb93b55361 100644
--- a/iconvdata/gb18030.c
+++ b/iconvdata/gb18030.c
@@ -32,7 +32,7 @@
 /* The tables in this file are generated from the charmap.
    In a first step, the charmap is converted to a simple table format:
 
-      ./tst-table-charmap.sh GB18030 < ../localedata/charmaps/GB18030 \
+      ./tst-table-charmap.sh < ../localedata/charmaps/GB18030 \
       > GB18030.table
 */
 
diff --git a/iconvdata/tst-table-charmap.sh b/iconvdata/tst-table-charmap.sh
index 0e5369aa38..36959a02dd 100755
--- a/iconvdata/tst-table-charmap.sh
+++ b/iconvdata/tst-table-charmap.sh
@@ -22,13 +22,4 @@
 LC_ALL=C
 export LC_ALL
 
-case "$1" in
-  POSIX )
-    # Old POSIX/DKUUG borrowed format
-    grep '^<.*>.*/x[0-9A-Fa-f]*[ 	]*<U....>.*$' | grep -v 'not a real character' | sed -e 's,^<.*>[ 	]*\([/x0-9A-Fa-f]*\)[ 	]*<U\(....\)>.*$,\1	0x\2,' | tr abcdef ABCDEF | sed -e 's,/x\([0-9A-F][0-9A-F]\),\1,g' | sed -e 's,^,0x,' | sort | uniq | grep -v '^0x00	0x\([1-9A-F]...\|.[1-9A-F]..\|..[1-9A-F].\|...[1-9A-F]\)'
-    ;;
-  *)
-    # New Unicode based format
-    sed -e 's,^%IRREVERSIBLE%,,' | grep '^<U[0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*>[ 	]*/x' | grep -v 'not a real character' | sed -e 's,<U\(....\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' -e 's,<U0*\([1-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' | tr abcdef ABCDEF | sed -e 's,/x\([0-9A-F][0-9A-F]\),\1,g' | sed -e 's,^,0x,' | sort | uniq | grep -v '^0x00	0x\([1-9A-F]...\|.[1-9A-F]..\|..[1-9A-F].\|...[1-9A-F]\)'
-    ;;
-esac
+sed -e 's,^%IRREVERSIBLE%,,' | grep '^<U[0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*>[ 	]*/x' | grep -v 'not a real character' | sed -e 's,<U\(....\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' -e 's,<U0*\([1-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' | tr abcdef ABCDEF | sed -e 's,/x\([0-9A-F][0-9A-F]\),\1,g' | sed -e 's,^,0x,' | sort | uniq | grep -v '^0x00	0x\([1-9A-F]...\|.[1-9A-F]..\|..[1-9A-F].\|...[1-9A-F]\)'
diff --git a/iconvdata/tst-table.sh b/iconvdata/tst-table.sh
index f63ab1d8ee..36005c5448 100755
--- a/iconvdata/tst-table.sh
+++ b/iconvdata/tst-table.sh
@@ -33,7 +33,7 @@ export LC_ALL
 set -e
 
 # Get the charmap.
-./tst-table-charmap.sh ${charmap:-$charset} \
+./tst-table-charmap.sh \
   < ../localedata/charmaps/${charmap:-$charset} \
   > ${objpfx}tst-${charset}.charmap.table
 # When the charset is GB18030, truncate this table because for this encoding,
-- 
2.30.2


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v5 2/2] POSIX locale covers every byte [BZ# 29511]
  2022-09-14  2:39       ` [PATCH v4 " наб
  2022-09-21 14:01         ` [PATCH v5 " наб
@ 2022-09-21 14:01         ` наб
  1 sibling, 0 replies; 29+ messages in thread
From: наб @ 2022-09-21 14:01 UTC (permalink / raw)
  To: libc-alpha; +Cc: Florian Weimer

[-- Attachment #1: Type: text/plain, Size: 23848 bytes --]

This is a trivial patch, largely duplicating the extant ASCII code

There are two user-facing changes:
  * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
  * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b

Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
  (a) is 1-byte, stateless, and contains 256 characters
  (b) they collate in byte order
  (c) the first 128 characters are equivalent to ASCII (like previous)
cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
changes to the standard;
in short, this means that mbrtowc() must never fail and must return
  b if b <= 0x7F else ab+c for all bytes b
  where c is some constant >=0x80
    and a is a positive integer constant

By strategically picking c=<UDF00> we land at the tail-end of the
Unicode Low Surrogate Area at DC00-DFFF, described as
  > Isolated surrogate code points have no interpretation;
  > consequently, no character code charts or names lists
  > are provided for this range.
and match musl

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
new in v2: nothing
new in v3: POSIX charset, __wcsmbs_gconv_fcts_c comment fixed
new in v4: nothing
new in v5: nothing

Resending after a week, per guideline.

 iconv/Makefile                    |   2 +-
 iconv/gconv_builtin.h             |   8 ++
 iconv/gconv_int.h                 |   7 ++
 iconv/gconv_posix.c               |  96 ++++++++++++++++++++
 iconv/tst-iconv_prog.sh           |  43 +++++++++
 iconvdata/tst-tables.sh           |   1 +
 inet/tst-idna_name_classify.c     |   6 +-
 locale/tst-C-locale.c             |  69 ++++++++++++++
 localedata/charmaps/POSIX         | 136 ++++++++++++++++++++++++++++
 localedata/locales/POSIX          | 143 +++++++++++++++++++++++++++++-
 stdio-common/tst-printf-bz25691.c |   2 +
 wcsmbs/wcsmbsload.c               |  14 +--
 12 files changed, 516 insertions(+), 11 deletions(-)
 create mode 100644 iconv/gconv_posix.c
 create mode 100644 localedata/charmaps/POSIX

diff --git a/iconv/Makefile b/iconv/Makefile
index a0d90cfeac..6e926f53e3 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -25,7 +25,7 @@ include ../Makeconfig
 headers		= iconv.h gconv.h
 routines	= iconv_open iconv iconv_close \
 		  gconv_open gconv gconv_close gconv_db gconv_conf \
-		  gconv_builtin gconv_simple gconv_trans gconv_cache
+		  gconv_builtin gconv_simple gconv_posix gconv_trans gconv_cache
 routines	+= gconv_dl gconv_charset
 
 vpath %.c ../locale/programs ../intl
diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h
index 68c2369b1f..cd1805b3ce 100644
--- a/iconv/gconv_builtin.h
+++ b/iconv/gconv_builtin.h
@@ -89,6 +89,14 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
 			__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
 
 
+BUILTIN_TRANSFORMATION ("POSIX//", "INTERNAL", 1, "=posix->INTERNAL",
+			__gconv_transform_posix_internal, __gconv_btwoc_posix,
+			1, 1, 4, 4)
+
+BUILTIN_TRANSFORMATION ("INTERNAL", "POSIX//", 1, "=INTERNAL->posix",
+			__gconv_transform_internal_posix, NULL, 4, 4, 1, 1)
+
+
 #if BYTE_ORDER == BIG_ENDIAN
 BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
 BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index 1c6745043e..45ab1edfad 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -281,6 +281,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2)
 
 __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
+__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
 __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
 __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
@@ -299,6 +301,11 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
    only ASCII characters.  */
 extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
 
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c);
+
 #endif
 
 __END_DECLS
diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c
new file mode 100644
index 0000000000..dcb13fbb43
--- /dev/null
+++ b/iconv/gconv_posix.c
@@ -0,0 +1,96 @@
+/* Simple transformations functions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <gconv_int.h>
+
+
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+wint_t
+__gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+{
+  if (c < 0x80)
+    return c;
+  else
+    return 0xdf00 + c;
+}
+
+
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DF80, U+DFFF]}
+   to the internal (UCS4-like) format.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		1
+#define MIN_NEEDED_TO		4
+#define FROM_DIRECTION		1
+#define FROM_LOOP		posix_internal_loop
+#define TO_LOOP			posix_internal_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_posix_internal
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdf00 + *inptr++;			      \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;				      \
+    outptr += sizeof (uint32_t);					      \
+  }
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DF80, U+DFFF] => [0x80, 0xFF]}.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		4
+#define MIN_NEEDED_TO		1
+#define FROM_DIRECTION		1
+#define FROM_LOOP		internal_posix_loop
+#define TO_LOOP			internal_posix_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_internal_posix
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    uint32_t val = *((const uint32_t *) inptr);				      \
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdf80) || val > 0xdfff))      \
+      {									      \
+	UNICODE_TAG_HANDLER (val, 4);					      \
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
+      }									      \
+    else								      \
+      {									      \
+	if (__glibc_unlikely (val > 0x7f))				      \
+	  val -= 0xdf00;						      \
+	*outptr++ = val;						      \
+	inptr += sizeof (uint32_t);					      \
+      }									      \
+  }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
index b3d8bf5110..a24d8d2207 100644
--- a/iconv/tst-iconv_prog.sh
+++ b/iconv/tst-iconv_prog.sh
@@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do
   execute_test
   check_errtest_result
 done
+
+allbytes ()
+{
+  for (( i = 0; i <= 255; i++ )); do
+    printf '\'"$(printf "%o" "$i")"
+  done
+}
+
+allucs4be ()
+{
+  for (( i = 0; i <= 127; i++ )); do
+    printf '\0\0\0\'"$(printf "%o" "$i")"
+  done
+  for (( i = 128; i <= 255; i++ )); do
+    printf '\0\0\xdf\'"$(printf "%o" "$i")"
+  done
+}
+
+check_posix_result ()
+{
+  if [ $? -eq 0 ]; then
+    result=PASS
+  else
+    result=FAIL
+  fi
+
+  echo "$result: from \"$1\", to: \"$2\""
+
+  if [ "$result" != "PASS" ]; then
+    exit 1
+  fi
+}
+
+check_posix_encoding ()
+{
+  eval PROG=\"$ICONV\"
+  allbytes  | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be)
+  check_posix_result POSIX UCS-4BE
+  allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes)
+  check_posix_result UCS-4BE POSIX
+}
+
+check_posix_encoding
diff --git a/iconvdata/tst-tables.sh b/iconvdata/tst-tables.sh
index 4207b44175..33a02158ac 100755
--- a/iconvdata/tst-tables.sh
+++ b/iconvdata/tst-tables.sh
@@ -31,6 +31,7 @@ cat <<EOF |
   # Keep this list in the same order as gconv-modules.
   #
   # charset name    table name          comment
+  POSIX
   ASCII             ANSI_X3.4-1968
   ISO646-GB         BS_4730
   ISO646-CA         CSA_Z243.4-1985-1
diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c
index bfd34eee31..b379481844 100644
--- a/inet/tst-idna_name_classify.c
+++ b/inet/tst-idna_name_classify.c
@@ -37,11 +37,11 @@ do_test (void)
   puts ("info: C locale tests");
   locale_insensitive_tests ();
   TEST_COMPARE (__idna_name_classify ("abc\200def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
   TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
-                idna_name_encoding_error);
+                idna_name_nonascii_backslash);
   TEST_COMPARE (__idna_name_classify ("abc\377def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
 
   puts ("info: en_US.ISO-8859-1 locale tests");
   if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)
diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
index 6bd0367069..f30396ae12 100644
--- a/locale/tst-C-locale.c
+++ b/locale/tst-C-locale.c
@@ -229,6 +229,75 @@ run_test (const char *locname)
   STRTEST (YESSTR, "");
   STRTEST (NOSTR, "");
 
+#define CONVTEST(b, v) \
+  {									      \
+    unsigned char bs[] = {b, 0};					      \
+    mbstate_t ctx = {};							      \
+    wchar_t wc = -1;							      \
+    size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);			      \
+    if (sz != !!b)							      \
+      {									      \
+	printf ("mbrtowc(%02hhx) width in locale %s wrong "		      \
+		"(is %zd, should be %d)\n", *bs, locname, sz, !!b);	      \
+	result = 1;							      \
+      }									      \
+    if (wc != v)							      \
+      {									      \
+	printf ("mbrtowc(%02hhx) value in locale %s wrong "		      \
+		"(is %x, should be %x)\n", *bs, locname, wc, v);	      \
+	result = 1;							      \
+      }									      \
+  }
+  for(int i = 0; i <= 0x7f; ++i)
+    CONVTEST(i, i);
+  for(int i = 0x80; i <= 0xff; ++i)
+    CONVTEST(i, 0xdf00 + i);
+
+#define DECONVTEST(v, b) \
+  {									      \
+    unsigned char ob = -1;						      \
+    mbstate_t ctx = {};							      \
+    size_t sz = wcrtomb((char *) &ob, v, &ctx);				      \
+    if (sz != 1)							      \
+      {									      \
+	printf ("wcrtomb(%x) width in locale %s wrong "			      \
+		"(is %zd, should be 1)\n", v, locname, sz);		      \
+	result = 1;							      \
+      }									      \
+    if (ob != b)							      \
+      {									      \
+	printf ("wcrtomb(%x) value in locale %s wrong "			      \
+		"(is %hhx, should be %hhx)\n", v, locname, ob, b);	      \
+	result = 1;							      \
+      }									      \
+  }
+#define DECONVERR(v) \
+  {									      \
+    unsigned char ob = -1;						      \
+    mbstate_t ctx = {};							      \
+    size_t sz = wcrtomb((char *) &ob, v, &ctx);				      \
+    if (sz != (size_t) -1)						      \
+      {									      \
+	printf ("wcrtomb(%x) width in locale %s wrong "			      \
+		"(is %zd, should be (size_t )-1)\n", v, locname, sz);	      \
+	result = 1;							      \
+      }									      \
+    if (ob != (unsigned char) -1)					      \
+      {									      \
+	printf ("wcrtomb(%x) value in locale %s wrong "			      \
+		"(is %hhx, should be unchanged)\n", v, locname, ob);	      \
+	result = 1;							      \
+      }									      \
+  }
+  for(int i = 0; i <= 0x7f; ++i)
+    DECONVTEST(i, i);
+  for(int i = 0x80; i < 0xdf00; ++i)
+    DECONVERR(i);
+  for(int i = 0x80; i <= 0xff; ++i)
+    DECONVTEST(0xdf00 + i, i);
+  for(int i = 0xe000; i <= 0xffff; ++i)
+    DECONVERR(i);
+
   /* Test the new locale mechanisms.  */
   loc = newlocale (LC_ALL_MASK, locname, NULL);
   if (loc == NULL)
diff --git a/localedata/charmaps/POSIX b/localedata/charmaps/POSIX
new file mode 100644
index 0000000000..c44007ff49
--- /dev/null
+++ b/localedata/charmaps/POSIX
@@ -0,0 +1,136 @@
+<code_set_name> POSIX
+<comment_char> %
+<escape_char> /
+% source: cf. localedata/locales/POSIX, LC_COLLATE
+
+CHARMAP
+<U0000>     /x00         NULL (NUL)
+<U0001>     /x01         START OF HEADING (SOH)
+<U0002>     /x02         START OF TEXT (STX)
+<U0003>     /x03         END OF TEXT (ETX)
+<U0004>     /x04         END OF TRANSMISSION (EOT)
+<U0005>     /x05         ENQUIRY (ENQ)
+<U0006>     /x06         ACKNOWLEDGE (ACK)
+<U0007>     /x07         BELL (BEL)
+<U0008>     /x08         BACKSPACE (BS)
+<U0009>     /x09         CHARACTER TABULATION (HT)
+<U000A>     /x0a         LINE FEED (LF)
+<U000B>     /x0b         LINE TABULATION (VT)
+<U000C>     /x0c         FORM FEED (FF)
+<U000D>     /x0d         CARRIAGE RETURN (CR)
+<U000E>     /x0e         SHIFT OUT (SO)
+<U000F>     /x0f         SHIFT IN (SI)
+<U0010>     /x10         DATALINK ESCAPE (DLE)
+<U0011>     /x11         DEVICE CONTROL ONE (DC1)
+<U0012>     /x12         DEVICE CONTROL TWO (DC2)
+<U0013>     /x13         DEVICE CONTROL THREE (DC3)
+<U0014>     /x14         DEVICE CONTROL FOUR (DC4)
+<U0015>     /x15         NEGATIVE ACKNOWLEDGE (NAK)
+<U0016>     /x16         SYNCHRONOUS IDLE (SYN)
+<U0017>     /x17         END OF TRANSMISSION BLOCK (ETB)
+<U0018>     /x18         CANCEL (CAN)
+<U0019>     /x19         END OF MEDIUM (EM)
+<U001A>     /x1a         SUBSTITUTE (SUB)
+<U001B>     /x1b         ESCAPE (ESC)
+<U001C>     /x1c         FILE SEPARATOR (IS4)
+<U001D>     /x1d         GROUP SEPARATOR (IS3)
+<U001E>     /x1e         RECORD SEPARATOR (IS2)
+<U001F>     /x1f         UNIT SEPARATOR (IS1)
+<U0020>     /x20         SPACE
+<U0021>     /x21         EXCLAMATION MARK
+<U0022>     /x22         QUOTATION MARK
+<U0023>     /x23         NUMBER SIGN
+<U0024>     /x24         DOLLAR SIGN
+<U0025>     /x25         PERCENT SIGN
+<U0026>     /x26         AMPERSAND
+<U0027>     /x27         APOSTROPHE
+<U0028>     /x28         LEFT PARENTHESIS
+<U0029>     /x29         RIGHT PARENTHESIS
+<U002A>     /x2a         ASTERISK
+<U002B>     /x2b         PLUS SIGN
+<U002C>     /x2c         COMMA
+<U002D>     /x2d         HYPHEN-MINUS
+<U002E>     /x2e         FULL STOP
+<U002F>     /x2f         SOLIDUS
+<U0030>     /x30         DIGIT ZERO
+<U0031>     /x31         DIGIT ONE
+<U0032>     /x32         DIGIT TWO
+<U0033>     /x33         DIGIT THREE
+<U0034>     /x34         DIGIT FOUR
+<U0035>     /x35         DIGIT FIVE
+<U0036>     /x36         DIGIT SIX
+<U0037>     /x37         DIGIT SEVEN
+<U0038>     /x38         DIGIT EIGHT
+<U0039>     /x39         DIGIT NINE
+<U003A>     /x3a         COLON
+<U003B>     /x3b         SEMICOLON
+<U003C>     /x3c         LESS-THAN SIGN
+<U003D>     /x3d         EQUALS SIGN
+<U003E>     /x3e         GREATER-THAN SIGN
+<U003F>     /x3f         QUESTION MARK
+<U0040>     /x40         COMMERCIAL AT
+<U0041>     /x41         LATIN CAPITAL LETTER A
+<U0042>     /x42         LATIN CAPITAL LETTER B
+<U0043>     /x43         LATIN CAPITAL LETTER C
+<U0044>     /x44         LATIN CAPITAL LETTER D
+<U0045>     /x45         LATIN CAPITAL LETTER E
+<U0046>     /x46         LATIN CAPITAL LETTER F
+<U0047>     /x47         LATIN CAPITAL LETTER G
+<U0048>     /x48         LATIN CAPITAL LETTER H
+<U0049>     /x49         LATIN CAPITAL LETTER I
+<U004A>     /x4a         LATIN CAPITAL LETTER J
+<U004B>     /x4b         LATIN CAPITAL LETTER K
+<U004C>     /x4c         LATIN CAPITAL LETTER L
+<U004D>     /x4d         LATIN CAPITAL LETTER M
+<U004E>     /x4e         LATIN CAPITAL LETTER N
+<U004F>     /x4f         LATIN CAPITAL LETTER O
+<U0050>     /x50         LATIN CAPITAL LETTER P
+<U0051>     /x51         LATIN CAPITAL LETTER Q
+<U0052>     /x52         LATIN CAPITAL LETTER R
+<U0053>     /x53         LATIN CAPITAL LETTER S
+<U0054>     /x54         LATIN CAPITAL LETTER T
+<U0055>     /x55         LATIN CAPITAL LETTER U
+<U0056>     /x56         LATIN CAPITAL LETTER V
+<U0057>     /x57         LATIN CAPITAL LETTER W
+<U0058>     /x58         LATIN CAPITAL LETTER X
+<U0059>     /x59         LATIN CAPITAL LETTER Y
+<U005A>     /x5a         LATIN CAPITAL LETTER Z
+<U005B>     /x5b         LEFT SQUARE BRACKET
+<U005C>     /x5c         REVERSE SOLIDUS
+<U005D>     /x5d         RIGHT SQUARE BRACKET
+<U005E>     /x5e         CIRCUMFLEX ACCENT
+<U005F>     /x5f         LOW LINE
+<U0060>     /x60         GRAVE ACCENT
+<U0061>     /x61         LATIN SMALL LETTER A
+<U0062>     /x62         LATIN SMALL LETTER B
+<U0063>     /x63         LATIN SMALL LETTER C
+<U0064>     /x64         LATIN SMALL LETTER D
+<U0065>     /x65         LATIN SMALL LETTER E
+<U0066>     /x66         LATIN SMALL LETTER F
+<U0067>     /x67         LATIN SMALL LETTER G
+<U0068>     /x68         LATIN SMALL LETTER H
+<U0069>     /x69         LATIN SMALL LETTER I
+<U006A>     /x6a         LATIN SMALL LETTER J
+<U006B>     /x6b         LATIN SMALL LETTER K
+<U006C>     /x6c         LATIN SMALL LETTER L
+<U006D>     /x6d         LATIN SMALL LETTER M
+<U006E>     /x6e         LATIN SMALL LETTER N
+<U006F>     /x6f         LATIN SMALL LETTER O
+<U0070>     /x70         LATIN SMALL LETTER P
+<U0071>     /x71         LATIN SMALL LETTER Q
+<U0072>     /x72         LATIN SMALL LETTER R
+<U0073>     /x73         LATIN SMALL LETTER S
+<U0074>     /x74         LATIN SMALL LETTER T
+<U0075>     /x75         LATIN SMALL LETTER U
+<U0076>     /x76         LATIN SMALL LETTER V
+<U0077>     /x77         LATIN SMALL LETTER W
+<U0078>     /x78         LATIN SMALL LETTER X
+<U0079>     /x79         LATIN SMALL LETTER Y
+<U007A>     /x7a         LATIN SMALL LETTER Z
+<U007B>     /x7b         LEFT CURLY BRACKET
+<U007C>     /x7c         VERTICAL LINE
+<U007D>     /x7d         RIGHT CURLY BRACKET
+<U007E>     /x7e         TILDE
+<U007F>     /x7f         DELETE (DEL)
+<UDF80>..<UDFFF> /x80
+END CHARMAP
diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
index 7ec7f1c577..fc34a6abc1 100644
--- a/localedata/locales/POSIX
+++ b/localedata/locales/POSIX
@@ -97,6 +97,20 @@ END LC_CTYPE
 LC_COLLATE
 % This is the POSIX Locale definition for the LC_COLLATE category.
 % The order is the same as in the ASCII code set.
+% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2,
+% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale:
+% > All characters not explicitly listed here shall be inserted
+% > in the character collation order after the listed characters
+% > and shall be assigned unique primary weights. If the listed
+% > characters have ASCII encoding, the other characters shall
+% > be in ascending order according to their coded character set values
+% Since Issue 7 TC2 (XBD, 6.2 Character Encoding):
+% > The POSIX locale shall contain 256 single-byte characters [...]
+% (cf. bug 663, 674).
+% this is in contrast to previous issues, which limited the POSIX
+% locale to the Portable Character Set (7-bit ASCII).
+% We use the end of the Low Surrogate Area to contain these,
+% yielding [<UDF80>, <UDFFF>]
 order_start forward
 <U0000>
 <U0001>
@@ -226,7 +240,134 @@ order_start forward
 <U007D>
 <U007E>
 <U007F>
-UNDEFINED
+<UDF80>
+<UDF81>
+<UDF82>
+<UDF83>
+<UDF84>
+<UDF85>
+<UDF86>
+<UDF87>
+<UDF88>
+<UDF89>
+<UDF8A>
+<UDF8B>
+<UDF8C>
+<UDF8D>
+<UDF8E>
+<UDF8F>
+<UDF90>
+<UDF91>
+<UDF92>
+<UDF93>
+<UDF94>
+<UDF95>
+<UDF96>
+<UDF97>
+<UDF98>
+<UDF99>
+<UDF9A>
+<UDF9B>
+<UDF9C>
+<UDF9D>
+<UDF9E>
+<UDF9F>
+<UDFA0>
+<UDFA1>
+<UDFA2>
+<UDFA3>
+<UDFA4>
+<UDFA5>
+<UDFA6>
+<UDFA7>
+<UDFA8>
+<UDFA9>
+<UDFAA>
+<UDFAB>
+<UDFAC>
+<UDFAD>
+<UDFAE>
+<UDFAF>
+<UDFB0>
+<UDFB1>
+<UDFB2>
+<UDFB3>
+<UDFB4>
+<UDFB5>
+<UDFB6>
+<UDFB7>
+<UDFB8>
+<UDFB9>
+<UDFBA>
+<UDFBB>
+<UDFBC>
+<UDFBD>
+<UDFBE>
+<UDFBF>
+<UDFC0>
+<UDFC1>
+<UDFC2>
+<UDFC3>
+<UDFC4>
+<UDFC5>
+<UDFC6>
+<UDFC7>
+<UDFC8>
+<UDFC9>
+<UDFCA>
+<UDFCB>
+<UDFCC>
+<UDFCD>
+<UDFCE>
+<UDFCF>
+<UDFD0>
+<UDFD1>
+<UDFD2>
+<UDFD3>
+<UDFD4>
+<UDFD5>
+<UDFD6>
+<UDFD7>
+<UDFD8>
+<UDFD9>
+<UDFDA>
+<UDFDB>
+<UDFDC>
+<UDFDD>
+<UDFDE>
+<UDFDF>
+<UDFE0>
+<UDFE1>
+<UDFE2>
+<UDFE3>
+<UDFE4>
+<UDFE5>
+<UDFE6>
+<UDFE7>
+<UDFE8>
+<UDFE9>
+<UDFEA>
+<UDFEB>
+<UDFEC>
+<UDFED>
+<UDFEE>
+<UDFEF>
+<UDFF0>
+<UDFF1>
+<UDFF2>
+<UDFF3>
+<UDFF4>
+<UDFF5>
+<UDFF6>
+<UDFF7>
+<UDFF8>
+<UDFF9>
+<UDFFA>
+<UDFFB>
+<UDFFC>
+<UDFFD>
+<UDFFE>
+<UDFFF>
 order_end
 %
 END LC_COLLATE
diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
index 44844e71c3..e66242b58f 100644
--- a/stdio-common/tst-printf-bz25691.c
+++ b/stdio-common/tst-printf-bz25691.c
@@ -30,6 +30,8 @@
 static int
 do_test (void)
 {
+  setlocale(LC_CTYPE, "C.UTF-8");
+
   mtrace ();
 
   /* For 's' conversion specifier with 'l' modifier the array must be
diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
index 0f0f55f9ed..97de9afd25 100644
--- a/wcsmbs/wcsmbsload.c
+++ b/wcsmbs/wcsmbsload.c
@@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
   .__shlib_handle = NULL,
   .__modname = NULL,
   .__counter = INT_MAX,
-  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
+  .__from_name = (char *) "POSIX",
   .__to_name = (char *) "INTERNAL",
-  .__fct = __gconv_transform_ascii_internal,
-  .__btowc_fct = __gconv_btwoc_ascii,
+  .__fct = __gconv_transform_posix_internal,
+  .__btowc_fct = __gconv_btwoc_posix,
   .__init_fct = NULL,
   .__end_fct = NULL,
   .__min_needed_from = 1,
@@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
   .__modname = NULL,
   .__counter = INT_MAX,
   .__from_name = (char *) "INTERNAL",
-  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
-  .__fct = __gconv_transform_internal_ascii,
+  .__to_name = (char *) "POSIX",
+  .__fct = __gconv_transform_internal_posix,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
@@ -67,7 +67,9 @@ static const struct __gconv_step to_mb =
 };
 
 
-/* For the default locale we only have to handle ANSI_X3.4-1968.  */
+/* The default/"POSIX"/"C" locale is an 8-bit-clean mapping
+   with ANSI_X3.4-1968 in the first 128 characters;
+   we lift the remaining bytes by <UDF00>.  */
 const struct gconv_fcts __wcsmbs_gconv_fcts_c =
 {
   .towc = (struct __gconv_step *) &to_wc,
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v6 1/2] iconvdata/tst-table-charmap.sh: remove handling of old, borrowed format
  2022-09-21 14:01         ` [PATCH v5 " наб
@ 2022-11-02 17:17           ` наб
  2022-11-09 12:49             ` Florian Weimer
  2022-11-02 17:17           ` [PATCH v6 2/2] POSIX locale covers every byte [BZ# 29511] наб
  1 sibling, 1 reply; 29+ messages in thread
From: наб @ 2022-11-02 17:17 UTC (permalink / raw)
  To: libc-alpha; +Cc: Florian Weimer

[-- Attachment #1: Type: text/plain, Size: 3184 bytes --]

This "Old POSIX/DKUUG borrowed format" handling is original to the file
and doesn't seem to have ever been used, i.e. id/t-t-c doesn't seem to
have ever been called with argv[1] == POSIX

Upcoming is a POSIX charmap, which would inadvertently trigger this:
clear the way

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
new in v4: nothing
new in v5: nothing
new in v6: clean rebase

 iconvdata/gb18030.c            |  2 +-
 iconvdata/tst-table-charmap.sh | 11 +----------
 iconvdata/tst-table.sh         |  2 +-
 3 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/iconvdata/gb18030.c b/iconvdata/gb18030.c
index ab9373cd12..bb93b55361 100644
--- a/iconvdata/gb18030.c
+++ b/iconvdata/gb18030.c
@@ -32,7 +32,7 @@
 /* The tables in this file are generated from the charmap.
    In a first step, the charmap is converted to a simple table format:
 
-      ./tst-table-charmap.sh GB18030 < ../localedata/charmaps/GB18030 \
+      ./tst-table-charmap.sh < ../localedata/charmaps/GB18030 \
       > GB18030.table
 */
 
diff --git a/iconvdata/tst-table-charmap.sh b/iconvdata/tst-table-charmap.sh
index 0e5369aa38..36959a02dd 100755
--- a/iconvdata/tst-table-charmap.sh
+++ b/iconvdata/tst-table-charmap.sh
@@ -22,13 +22,4 @@
 LC_ALL=C
 export LC_ALL
 
-case "$1" in
-  POSIX )
-    # Old POSIX/DKUUG borrowed format
-    grep '^<.*>.*/x[0-9A-Fa-f]*[ 	]*<U....>.*$' | grep -v 'not a real character' | sed -e 's,^<.*>[ 	]*\([/x0-9A-Fa-f]*\)[ 	]*<U\(....\)>.*$,\1	0x\2,' | tr abcdef ABCDEF | sed -e 's,/x\([0-9A-F][0-9A-F]\),\1,g' | sed -e 's,^,0x,' | sort | uniq | grep -v '^0x00	0x\([1-9A-F]...\|.[1-9A-F]..\|..[1-9A-F].\|...[1-9A-F]\)'
-    ;;
-  *)
-    # New Unicode based format
-    sed -e 's,^%IRREVERSIBLE%,,' | grep '^<U[0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*>[ 	]*/x' | grep -v 'not a real character' | sed -e 's,<U\(....\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' -e 's,<U0*\([1-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' | tr abcdef ABCDEF | sed -e 's,/x\([0-9A-F][0-9A-F]\),\1,g' | sed -e 's,^,0x,' | sort | uniq | grep -v '^0x00	0x\([1-9A-F]...\|.[1-9A-F]..\|..[1-9A-F].\|...[1-9A-F]\)'
-    ;;
-esac
+sed -e 's,^%IRREVERSIBLE%,,' | grep '^<U[0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*>[ 	]*/x' | grep -v 'not a real character' | sed -e 's,<U\(....\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' -e 's,<U0*\([1-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F][0-9A-F]*\)>[ 	]*\([/x0-9A-Fa-f]*\).*$,\2	0x\1,' | tr abcdef ABCDEF | sed -e 's,/x\([0-9A-F][0-9A-F]\),\1,g' | sed -e 's,^,0x,' | sort | uniq | grep -v '^0x00	0x\([1-9A-F]...\|.[1-9A-F]..\|..[1-9A-F].\|...[1-9A-F]\)'
diff --git a/iconvdata/tst-table.sh b/iconvdata/tst-table.sh
index f63ab1d8ee..36005c5448 100755
--- a/iconvdata/tst-table.sh
+++ b/iconvdata/tst-table.sh
@@ -33,7 +33,7 @@ export LC_ALL
 set -e
 
 # Get the charmap.
-./tst-table-charmap.sh ${charmap:-$charset} \
+./tst-table-charmap.sh \
   < ../localedata/charmaps/${charmap:-$charset} \
   > ${objpfx}tst-${charset}.charmap.table
 # When the charset is GB18030, truncate this table because for this encoding,
-- 
2.30.2


[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v6 2/2] POSIX locale covers every byte [BZ# 29511]
  2022-09-21 14:01         ` [PATCH v5 " наб
  2022-11-02 17:17           ` [PATCH v6 " наб
@ 2022-11-02 17:17           ` наб
  2022-11-09 14:20             ` Florian Weimer
  1 sibling, 1 reply; 29+ messages in thread
From: наб @ 2022-11-02 17:17 UTC (permalink / raw)
  To: libc-alpha; +Cc: Florian Weimer

[-- Attachment #1: Type: text/plain, Size: 23891 bytes --]

This is a logistically trivial patch,
largely duplicating the extant ASCII code with the error path changed

There are two user-facing changes:
  * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
  * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b

Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
  (a) is 1-byte, stateless, and contains 256 characters
  (b) they collate in byte order
  (c) the first 128 characters are equivalent to ASCII (like previous)
cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
changes to the standard;
in short, this means that mbrtowc() must never fail and must return
  b if b <= 0x7F else ab+c for all bytes b
  where c is some constant >=0x80
    and a is a positive integer constant

By strategically picking c=<UDF00> we land at the tail-end of the
Unicode Low Surrogate Area at DC00-DFFF, described as
  > Isolated surrogate code points have no interpretation;
  > consequently, no character code charts or names lists
  > are provided for this range.
and match musl

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
new in v2: nothing
new in v3: POSIX charset, __wcsmbs_gconv_fcts_c comment fixed
new in v4: nothing
new in v5: nothing
new in v6: clean rebase, rephrase message

 iconv/Makefile                    |   2 +-
 iconv/gconv_builtin.h             |   8 ++
 iconv/gconv_int.h                 |   7 ++
 iconv/gconv_posix.c               |  96 ++++++++++++++++++++
 iconv/tst-iconv_prog.sh           |  43 +++++++++
 iconvdata/tst-tables.sh           |   1 +
 inet/tst-idna_name_classify.c     |   6 +-
 locale/tst-C-locale.c             |  69 ++++++++++++++
 localedata/charmaps/POSIX         | 136 ++++++++++++++++++++++++++++
 localedata/locales/POSIX          | 143 +++++++++++++++++++++++++++++-
 stdio-common/tst-printf-bz25691.c |   2 +
 wcsmbs/wcsmbsload.c               |  14 +--
 12 files changed, 516 insertions(+), 11 deletions(-)
 create mode 100644 iconv/gconv_posix.c
 create mode 100644 localedata/charmaps/POSIX

diff --git a/iconv/Makefile b/iconv/Makefile
index a0d90cfeac..6e926f53e3 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -25,7 +25,7 @@ include ../Makeconfig
 headers		= iconv.h gconv.h
 routines	= iconv_open iconv iconv_close \
 		  gconv_open gconv gconv_close gconv_db gconv_conf \
-		  gconv_builtin gconv_simple gconv_trans gconv_cache
+		  gconv_builtin gconv_simple gconv_posix gconv_trans gconv_cache
 routines	+= gconv_dl gconv_charset
 
 vpath %.c ../locale/programs ../intl
diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h
index 68c2369b1f..cd1805b3ce 100644
--- a/iconv/gconv_builtin.h
+++ b/iconv/gconv_builtin.h
@@ -89,6 +89,14 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
 			__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
 
 
+BUILTIN_TRANSFORMATION ("POSIX//", "INTERNAL", 1, "=posix->INTERNAL",
+			__gconv_transform_posix_internal, __gconv_btwoc_posix,
+			1, 1, 4, 4)
+
+BUILTIN_TRANSFORMATION ("INTERNAL", "POSIX//", 1, "=INTERNAL->posix",
+			__gconv_transform_internal_posix, NULL, 4, 4, 1, 1)
+
+
 #if BYTE_ORDER == BIG_ENDIAN
 BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
 BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index 1c6745043e..45ab1edfad 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -281,6 +281,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2)
 
 __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
+__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
 __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
 __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
@@ -299,6 +301,11 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
    only ASCII characters.  */
 extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
 
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c);
+
 #endif
 
 __END_DECLS
diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c
new file mode 100644
index 0000000000..dcb13fbb43
--- /dev/null
+++ b/iconv/gconv_posix.c
@@ -0,0 +1,96 @@
+/* Simple transformations functions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <gconv_int.h>
+
+
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+wint_t
+__gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+{
+  if (c < 0x80)
+    return c;
+  else
+    return 0xdf00 + c;
+}
+
+
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DF80, U+DFFF]}
+   to the internal (UCS4-like) format.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		1
+#define MIN_NEEDED_TO		4
+#define FROM_DIRECTION		1
+#define FROM_LOOP		posix_internal_loop
+#define TO_LOOP			posix_internal_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_posix_internal
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdf00 + *inptr++;			      \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;				      \
+    outptr += sizeof (uint32_t);					      \
+  }
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DF80, U+DFFF] => [0x80, 0xFF]}.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		4
+#define MIN_NEEDED_TO		1
+#define FROM_DIRECTION		1
+#define FROM_LOOP		internal_posix_loop
+#define TO_LOOP			internal_posix_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_internal_posix
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    uint32_t val = *((const uint32_t *) inptr);				      \
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdf80) || val > 0xdfff))      \
+      {									      \
+	UNICODE_TAG_HANDLER (val, 4);					      \
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
+      }									      \
+    else								      \
+      {									      \
+	if (__glibc_unlikely (val > 0x7f))				      \
+	  val -= 0xdf00;						      \
+	*outptr++ = val;						      \
+	inptr += sizeof (uint32_t);					      \
+      }									      \
+  }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
index b3d8bf5110..a24d8d2207 100644
--- a/iconv/tst-iconv_prog.sh
+++ b/iconv/tst-iconv_prog.sh
@@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do
   execute_test
   check_errtest_result
 done
+
+allbytes ()
+{
+  for (( i = 0; i <= 255; i++ )); do
+    printf '\'"$(printf "%o" "$i")"
+  done
+}
+
+allucs4be ()
+{
+  for (( i = 0; i <= 127; i++ )); do
+    printf '\0\0\0\'"$(printf "%o" "$i")"
+  done
+  for (( i = 128; i <= 255; i++ )); do
+    printf '\0\0\xdf\'"$(printf "%o" "$i")"
+  done
+}
+
+check_posix_result ()
+{
+  if [ $? -eq 0 ]; then
+    result=PASS
+  else
+    result=FAIL
+  fi
+
+  echo "$result: from \"$1\", to: \"$2\""
+
+  if [ "$result" != "PASS" ]; then
+    exit 1
+  fi
+}
+
+check_posix_encoding ()
+{
+  eval PROG=\"$ICONV\"
+  allbytes  | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be)
+  check_posix_result POSIX UCS-4BE
+  allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes)
+  check_posix_result UCS-4BE POSIX
+}
+
+check_posix_encoding
diff --git a/iconvdata/tst-tables.sh b/iconvdata/tst-tables.sh
index 4207b44175..33a02158ac 100755
--- a/iconvdata/tst-tables.sh
+++ b/iconvdata/tst-tables.sh
@@ -31,6 +31,7 @@ cat <<EOF |
   # Keep this list in the same order as gconv-modules.
   #
   # charset name    table name          comment
+  POSIX
   ASCII             ANSI_X3.4-1968
   ISO646-GB         BS_4730
   ISO646-CA         CSA_Z243.4-1985-1
diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c
index bfd34eee31..b379481844 100644
--- a/inet/tst-idna_name_classify.c
+++ b/inet/tst-idna_name_classify.c
@@ -37,11 +37,11 @@ do_test (void)
   puts ("info: C locale tests");
   locale_insensitive_tests ();
   TEST_COMPARE (__idna_name_classify ("abc\200def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
   TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
-                idna_name_encoding_error);
+                idna_name_nonascii_backslash);
   TEST_COMPARE (__idna_name_classify ("abc\377def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
 
   puts ("info: en_US.ISO-8859-1 locale tests");
   if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)
diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
index 6bd0367069..f30396ae12 100644
--- a/locale/tst-C-locale.c
+++ b/locale/tst-C-locale.c
@@ -229,6 +229,75 @@ run_test (const char *locname)
   STRTEST (YESSTR, "");
   STRTEST (NOSTR, "");
 
+#define CONVTEST(b, v) \
+  {									      \
+    unsigned char bs[] = {b, 0};					      \
+    mbstate_t ctx = {};							      \
+    wchar_t wc = -1;							      \
+    size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);			      \
+    if (sz != !!b)							      \
+      {									      \
+	printf ("mbrtowc(%02hhx) width in locale %s wrong "		      \
+		"(is %zd, should be %d)\n", *bs, locname, sz, !!b);	      \
+	result = 1;							      \
+      }									      \
+    if (wc != v)							      \
+      {									      \
+	printf ("mbrtowc(%02hhx) value in locale %s wrong "		      \
+		"(is %x, should be %x)\n", *bs, locname, wc, v);	      \
+	result = 1;							      \
+      }									      \
+  }
+  for(int i = 0; i <= 0x7f; ++i)
+    CONVTEST(i, i);
+  for(int i = 0x80; i <= 0xff; ++i)
+    CONVTEST(i, 0xdf00 + i);
+
+#define DECONVTEST(v, b) \
+  {									      \
+    unsigned char ob = -1;						      \
+    mbstate_t ctx = {};							      \
+    size_t sz = wcrtomb((char *) &ob, v, &ctx);				      \
+    if (sz != 1)							      \
+      {									      \
+	printf ("wcrtomb(%x) width in locale %s wrong "			      \
+		"(is %zd, should be 1)\n", v, locname, sz);		      \
+	result = 1;							      \
+      }									      \
+    if (ob != b)							      \
+      {									      \
+	printf ("wcrtomb(%x) value in locale %s wrong "			      \
+		"(is %hhx, should be %hhx)\n", v, locname, ob, b);	      \
+	result = 1;							      \
+      }									      \
+  }
+#define DECONVERR(v) \
+  {									      \
+    unsigned char ob = -1;						      \
+    mbstate_t ctx = {};							      \
+    size_t sz = wcrtomb((char *) &ob, v, &ctx);				      \
+    if (sz != (size_t) -1)						      \
+      {									      \
+	printf ("wcrtomb(%x) width in locale %s wrong "			      \
+		"(is %zd, should be (size_t )-1)\n", v, locname, sz);	      \
+	result = 1;							      \
+      }									      \
+    if (ob != (unsigned char) -1)					      \
+      {									      \
+	printf ("wcrtomb(%x) value in locale %s wrong "			      \
+		"(is %hhx, should be unchanged)\n", v, locname, ob);	      \
+	result = 1;							      \
+      }									      \
+  }
+  for(int i = 0; i <= 0x7f; ++i)
+    DECONVTEST(i, i);
+  for(int i = 0x80; i < 0xdf00; ++i)
+    DECONVERR(i);
+  for(int i = 0x80; i <= 0xff; ++i)
+    DECONVTEST(0xdf00 + i, i);
+  for(int i = 0xe000; i <= 0xffff; ++i)
+    DECONVERR(i);
+
   /* Test the new locale mechanisms.  */
   loc = newlocale (LC_ALL_MASK, locname, NULL);
   if (loc == NULL)
diff --git a/localedata/charmaps/POSIX b/localedata/charmaps/POSIX
new file mode 100644
index 0000000000..c44007ff49
--- /dev/null
+++ b/localedata/charmaps/POSIX
@@ -0,0 +1,136 @@
+<code_set_name> POSIX
+<comment_char> %
+<escape_char> /
+% source: cf. localedata/locales/POSIX, LC_COLLATE
+
+CHARMAP
+<U0000>     /x00         NULL (NUL)
+<U0001>     /x01         START OF HEADING (SOH)
+<U0002>     /x02         START OF TEXT (STX)
+<U0003>     /x03         END OF TEXT (ETX)
+<U0004>     /x04         END OF TRANSMISSION (EOT)
+<U0005>     /x05         ENQUIRY (ENQ)
+<U0006>     /x06         ACKNOWLEDGE (ACK)
+<U0007>     /x07         BELL (BEL)
+<U0008>     /x08         BACKSPACE (BS)
+<U0009>     /x09         CHARACTER TABULATION (HT)
+<U000A>     /x0a         LINE FEED (LF)
+<U000B>     /x0b         LINE TABULATION (VT)
+<U000C>     /x0c         FORM FEED (FF)
+<U000D>     /x0d         CARRIAGE RETURN (CR)
+<U000E>     /x0e         SHIFT OUT (SO)
+<U000F>     /x0f         SHIFT IN (SI)
+<U0010>     /x10         DATALINK ESCAPE (DLE)
+<U0011>     /x11         DEVICE CONTROL ONE (DC1)
+<U0012>     /x12         DEVICE CONTROL TWO (DC2)
+<U0013>     /x13         DEVICE CONTROL THREE (DC3)
+<U0014>     /x14         DEVICE CONTROL FOUR (DC4)
+<U0015>     /x15         NEGATIVE ACKNOWLEDGE (NAK)
+<U0016>     /x16         SYNCHRONOUS IDLE (SYN)
+<U0017>     /x17         END OF TRANSMISSION BLOCK (ETB)
+<U0018>     /x18         CANCEL (CAN)
+<U0019>     /x19         END OF MEDIUM (EM)
+<U001A>     /x1a         SUBSTITUTE (SUB)
+<U001B>     /x1b         ESCAPE (ESC)
+<U001C>     /x1c         FILE SEPARATOR (IS4)
+<U001D>     /x1d         GROUP SEPARATOR (IS3)
+<U001E>     /x1e         RECORD SEPARATOR (IS2)
+<U001F>     /x1f         UNIT SEPARATOR (IS1)
+<U0020>     /x20         SPACE
+<U0021>     /x21         EXCLAMATION MARK
+<U0022>     /x22         QUOTATION MARK
+<U0023>     /x23         NUMBER SIGN
+<U0024>     /x24         DOLLAR SIGN
+<U0025>     /x25         PERCENT SIGN
+<U0026>     /x26         AMPERSAND
+<U0027>     /x27         APOSTROPHE
+<U0028>     /x28         LEFT PARENTHESIS
+<U0029>     /x29         RIGHT PARENTHESIS
+<U002A>     /x2a         ASTERISK
+<U002B>     /x2b         PLUS SIGN
+<U002C>     /x2c         COMMA
+<U002D>     /x2d         HYPHEN-MINUS
+<U002E>     /x2e         FULL STOP
+<U002F>     /x2f         SOLIDUS
+<U0030>     /x30         DIGIT ZERO
+<U0031>     /x31         DIGIT ONE
+<U0032>     /x32         DIGIT TWO
+<U0033>     /x33         DIGIT THREE
+<U0034>     /x34         DIGIT FOUR
+<U0035>     /x35         DIGIT FIVE
+<U0036>     /x36         DIGIT SIX
+<U0037>     /x37         DIGIT SEVEN
+<U0038>     /x38         DIGIT EIGHT
+<U0039>     /x39         DIGIT NINE
+<U003A>     /x3a         COLON
+<U003B>     /x3b         SEMICOLON
+<U003C>     /x3c         LESS-THAN SIGN
+<U003D>     /x3d         EQUALS SIGN
+<U003E>     /x3e         GREATER-THAN SIGN
+<U003F>     /x3f         QUESTION MARK
+<U0040>     /x40         COMMERCIAL AT
+<U0041>     /x41         LATIN CAPITAL LETTER A
+<U0042>     /x42         LATIN CAPITAL LETTER B
+<U0043>     /x43         LATIN CAPITAL LETTER C
+<U0044>     /x44         LATIN CAPITAL LETTER D
+<U0045>     /x45         LATIN CAPITAL LETTER E
+<U0046>     /x46         LATIN CAPITAL LETTER F
+<U0047>     /x47         LATIN CAPITAL LETTER G
+<U0048>     /x48         LATIN CAPITAL LETTER H
+<U0049>     /x49         LATIN CAPITAL LETTER I
+<U004A>     /x4a         LATIN CAPITAL LETTER J
+<U004B>     /x4b         LATIN CAPITAL LETTER K
+<U004C>     /x4c         LATIN CAPITAL LETTER L
+<U004D>     /x4d         LATIN CAPITAL LETTER M
+<U004E>     /x4e         LATIN CAPITAL LETTER N
+<U004F>     /x4f         LATIN CAPITAL LETTER O
+<U0050>     /x50         LATIN CAPITAL LETTER P
+<U0051>     /x51         LATIN CAPITAL LETTER Q
+<U0052>     /x52         LATIN CAPITAL LETTER R
+<U0053>     /x53         LATIN CAPITAL LETTER S
+<U0054>     /x54         LATIN CAPITAL LETTER T
+<U0055>     /x55         LATIN CAPITAL LETTER U
+<U0056>     /x56         LATIN CAPITAL LETTER V
+<U0057>     /x57         LATIN CAPITAL LETTER W
+<U0058>     /x58         LATIN CAPITAL LETTER X
+<U0059>     /x59         LATIN CAPITAL LETTER Y
+<U005A>     /x5a         LATIN CAPITAL LETTER Z
+<U005B>     /x5b         LEFT SQUARE BRACKET
+<U005C>     /x5c         REVERSE SOLIDUS
+<U005D>     /x5d         RIGHT SQUARE BRACKET
+<U005E>     /x5e         CIRCUMFLEX ACCENT
+<U005F>     /x5f         LOW LINE
+<U0060>     /x60         GRAVE ACCENT
+<U0061>     /x61         LATIN SMALL LETTER A
+<U0062>     /x62         LATIN SMALL LETTER B
+<U0063>     /x63         LATIN SMALL LETTER C
+<U0064>     /x64         LATIN SMALL LETTER D
+<U0065>     /x65         LATIN SMALL LETTER E
+<U0066>     /x66         LATIN SMALL LETTER F
+<U0067>     /x67         LATIN SMALL LETTER G
+<U0068>     /x68         LATIN SMALL LETTER H
+<U0069>     /x69         LATIN SMALL LETTER I
+<U006A>     /x6a         LATIN SMALL LETTER J
+<U006B>     /x6b         LATIN SMALL LETTER K
+<U006C>     /x6c         LATIN SMALL LETTER L
+<U006D>     /x6d         LATIN SMALL LETTER M
+<U006E>     /x6e         LATIN SMALL LETTER N
+<U006F>     /x6f         LATIN SMALL LETTER O
+<U0070>     /x70         LATIN SMALL LETTER P
+<U0071>     /x71         LATIN SMALL LETTER Q
+<U0072>     /x72         LATIN SMALL LETTER R
+<U0073>     /x73         LATIN SMALL LETTER S
+<U0074>     /x74         LATIN SMALL LETTER T
+<U0075>     /x75         LATIN SMALL LETTER U
+<U0076>     /x76         LATIN SMALL LETTER V
+<U0077>     /x77         LATIN SMALL LETTER W
+<U0078>     /x78         LATIN SMALL LETTER X
+<U0079>     /x79         LATIN SMALL LETTER Y
+<U007A>     /x7a         LATIN SMALL LETTER Z
+<U007B>     /x7b         LEFT CURLY BRACKET
+<U007C>     /x7c         VERTICAL LINE
+<U007D>     /x7d         RIGHT CURLY BRACKET
+<U007E>     /x7e         TILDE
+<U007F>     /x7f         DELETE (DEL)
+<UDF80>..<UDFFF> /x80
+END CHARMAP
diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
index 7ec7f1c577..fc34a6abc1 100644
--- a/localedata/locales/POSIX
+++ b/localedata/locales/POSIX
@@ -97,6 +97,20 @@ END LC_CTYPE
 LC_COLLATE
 % This is the POSIX Locale definition for the LC_COLLATE category.
 % The order is the same as in the ASCII code set.
+% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2,
+% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale:
+% > All characters not explicitly listed here shall be inserted
+% > in the character collation order after the listed characters
+% > and shall be assigned unique primary weights. If the listed
+% > characters have ASCII encoding, the other characters shall
+% > be in ascending order according to their coded character set values
+% Since Issue 7 TC2 (XBD, 6.2 Character Encoding):
+% > The POSIX locale shall contain 256 single-byte characters [...]
+% (cf. bug 663, 674).
+% this is in contrast to previous issues, which limited the POSIX
+% locale to the Portable Character Set (7-bit ASCII).
+% We use the end of the Low Surrogate Area to contain these,
+% yielding [<UDF80>, <UDFFF>]
 order_start forward
 <U0000>
 <U0001>
@@ -226,7 +240,134 @@ order_start forward
 <U007D>
 <U007E>
 <U007F>
-UNDEFINED
+<UDF80>
+<UDF81>
+<UDF82>
+<UDF83>
+<UDF84>
+<UDF85>
+<UDF86>
+<UDF87>
+<UDF88>
+<UDF89>
+<UDF8A>
+<UDF8B>
+<UDF8C>
+<UDF8D>
+<UDF8E>
+<UDF8F>
+<UDF90>
+<UDF91>
+<UDF92>
+<UDF93>
+<UDF94>
+<UDF95>
+<UDF96>
+<UDF97>
+<UDF98>
+<UDF99>
+<UDF9A>
+<UDF9B>
+<UDF9C>
+<UDF9D>
+<UDF9E>
+<UDF9F>
+<UDFA0>
+<UDFA1>
+<UDFA2>
+<UDFA3>
+<UDFA4>
+<UDFA5>
+<UDFA6>
+<UDFA7>
+<UDFA8>
+<UDFA9>
+<UDFAA>
+<UDFAB>
+<UDFAC>
+<UDFAD>
+<UDFAE>
+<UDFAF>
+<UDFB0>
+<UDFB1>
+<UDFB2>
+<UDFB3>
+<UDFB4>
+<UDFB5>
+<UDFB6>
+<UDFB7>
+<UDFB8>
+<UDFB9>
+<UDFBA>
+<UDFBB>
+<UDFBC>
+<UDFBD>
+<UDFBE>
+<UDFBF>
+<UDFC0>
+<UDFC1>
+<UDFC2>
+<UDFC3>
+<UDFC4>
+<UDFC5>
+<UDFC6>
+<UDFC7>
+<UDFC8>
+<UDFC9>
+<UDFCA>
+<UDFCB>
+<UDFCC>
+<UDFCD>
+<UDFCE>
+<UDFCF>
+<UDFD0>
+<UDFD1>
+<UDFD2>
+<UDFD3>
+<UDFD4>
+<UDFD5>
+<UDFD6>
+<UDFD7>
+<UDFD8>
+<UDFD9>
+<UDFDA>
+<UDFDB>
+<UDFDC>
+<UDFDD>
+<UDFDE>
+<UDFDF>
+<UDFE0>
+<UDFE1>
+<UDFE2>
+<UDFE3>
+<UDFE4>
+<UDFE5>
+<UDFE6>
+<UDFE7>
+<UDFE8>
+<UDFE9>
+<UDFEA>
+<UDFEB>
+<UDFEC>
+<UDFED>
+<UDFEE>
+<UDFEF>
+<UDFF0>
+<UDFF1>
+<UDFF2>
+<UDFF3>
+<UDFF4>
+<UDFF5>
+<UDFF6>
+<UDFF7>
+<UDFF8>
+<UDFF9>
+<UDFFA>
+<UDFFB>
+<UDFFC>
+<UDFFD>
+<UDFFE>
+<UDFFF>
 order_end
 %
 END LC_COLLATE
diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
index 44844e71c3..e66242b58f 100644
--- a/stdio-common/tst-printf-bz25691.c
+++ b/stdio-common/tst-printf-bz25691.c
@@ -30,6 +30,8 @@
 static int
 do_test (void)
 {
+  setlocale(LC_CTYPE, "C.UTF-8");
+
   mtrace ();
 
   /* For 's' conversion specifier with 'l' modifier the array must be
diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
index 0f0f55f9ed..97de9afd25 100644
--- a/wcsmbs/wcsmbsload.c
+++ b/wcsmbs/wcsmbsload.c
@@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
   .__shlib_handle = NULL,
   .__modname = NULL,
   .__counter = INT_MAX,
-  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
+  .__from_name = (char *) "POSIX",
   .__to_name = (char *) "INTERNAL",
-  .__fct = __gconv_transform_ascii_internal,
-  .__btowc_fct = __gconv_btwoc_ascii,
+  .__fct = __gconv_transform_posix_internal,
+  .__btowc_fct = __gconv_btwoc_posix,
   .__init_fct = NULL,
   .__end_fct = NULL,
   .__min_needed_from = 1,
@@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
   .__modname = NULL,
   .__counter = INT_MAX,
   .__from_name = (char *) "INTERNAL",
-  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
-  .__fct = __gconv_transform_internal_ascii,
+  .__to_name = (char *) "POSIX",
+  .__fct = __gconv_transform_internal_posix,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
@@ -67,7 +67,9 @@ static const struct __gconv_step to_mb =
 };
 
 
-/* For the default locale we only have to handle ANSI_X3.4-1968.  */
+/* The default/"POSIX"/"C" locale is an 8-bit-clean mapping
+   with ANSI_X3.4-1968 in the first 128 characters;
+   we lift the remaining bytes by <UDF00>.  */
 const struct gconv_fcts __wcsmbs_gconv_fcts_c =
 {
   .towc = (struct __gconv_step *) &to_wc,
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v6 1/2] iconvdata/tst-table-charmap.sh: remove handling of old, borrowed format
  2022-11-02 17:17           ` [PATCH v6 " наб
@ 2022-11-09 12:49             ` Florian Weimer
  0 siblings, 0 replies; 29+ messages in thread
From: Florian Weimer @ 2022-11-09 12:49 UTC (permalink / raw)
  To: наб; +Cc: libc-alpha

* наб:

> This "Old POSIX/DKUUG borrowed format" handling is original to the file
> and doesn't seem to have ever been used, i.e. id/t-t-c doesn't seem to
> have ever been called with argv[1] == POSIX
>
> Upcoming is a POSIX charmap, which would inadvertently trigger this:
> clear the way
>
> Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>

Reviewed-by: Florian Weimer <fweimer@redhat.com>

I've pushed this with some minor commit message edits.

Thanks,
Florian


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v6 2/2] POSIX locale covers every byte [BZ# 29511]
  2022-11-02 17:17           ` [PATCH v6 2/2] POSIX locale covers every byte [BZ# 29511] наб
@ 2022-11-09 14:20             ` Florian Weimer
  2022-11-09 16:14               ` [PATCH v7] " наб
  2022-11-10  8:10               ` [PATCH v6 2/2] " Florian Weimer
  0 siblings, 2 replies; 29+ messages in thread
From: Florian Weimer @ 2022-11-09 14:20 UTC (permalink / raw)
  To: наб; +Cc: libc-alpha, Victor Stinner

* наб:

> This is a logistically trivial patch,
> largely duplicating the extant ASCII code with the error path changed

I wouldn't say it's trivial in the commit message. 8-)

> There are two user-facing changes:
>   * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
>   * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b
>
> Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
>   (a) is 1-byte, stateless, and contains 256 characters
>   (b) they collate in byte order
>   (c) the first 128 characters are equivalent to ASCII (like previous)
> cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
> changes to the standard;
> in short, this means that mbrtowc() must never fail and must return
>   b if b <= 0x7F else ab+c for all bytes b
>   where c is some constant >=0x80
>     and a is a positive integer constant
>
> By strategically picking c=<UDF00> we land at the tail-end of the
> Unicode Low Surrogate Area at DC00-DFFF, described as
>   > Isolated surrogate code points have no interpretation;
>   > consequently, no character code charts or names lists
>   > are provided for this range.
> and match musl

Sadly this doesn't match Python and PEP 540:

>>> b'\x80'.decode('UTF-8', errors='surrogateescape')
'\udc80'

I believe the implementation translates this to 0xDF80 instead.

Not sure what is more important here, musl compatibility or Python
compatibility.  Cc:ing Victor in case he as comments.  I should probably
ask on the musl list as well as how this divergence came to pass.

This change definitely needs a NEWS entry.

The mechanics of the patch look okay to me, just a few nits below.

> diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
> index 1c6745043e..45ab1edfad 100644
> --- a/iconv/gconv_int.h
> +++ b/iconv/gconv_int.h
> @@ -281,6 +281,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2)
>  
>  __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
>  __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
> +__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
> +__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
>  __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
>  __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
>  __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
> @@ -299,6 +301,11 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
>     only ASCII characters.  */
>  extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
>  
> +/* Specialized conversion function for a single byte to INTERNAL,
> +   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
> +   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
> +extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c);
> +
>  #endif

Missing attribute_hidden.  Yes, it's also missing from
__gconv_btwoc_ascii.  The linker probably papers over it.

>  
>  __END_DECLS
> diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c
> new file mode 100644
> index 0000000000..dcb13fbb43
> --- /dev/null
> +++ b/iconv/gconv_posix.c
> @@ -0,0 +1,96 @@
> +/* Simple transformations functions.

I think this line should say something about surrogate-escape encoding
for the POSIX locale.

> +#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
> +#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
> +#define LOOPFCT			FROM_LOOP
> +#define BODY \
> +  {									      \
> +    uint32_t val = *((const uint32_t *) inptr);				      \
> +    if (__glibc_unlikely ((val > 0x7f && val < 0xdf80) || val > 0xdfff))      \
> +      {									      \
> +	UNICODE_TAG_HANDLER (val, 4);					      \
> +	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
> +      }									      \
> +    else								      \
> +      {									      \
> +	if (__glibc_unlikely (val > 0x7f))				      \
> +	  val -= 0xdf00;						      \
> +	*outptr++ = val;						      \
> +	inptr += sizeof (uint32_t);					      \
> +      }									      \
> +  }

I suggest to drop the last __glibc_unlikely here because it's
input-dependent.

> +#define LOOP_NEED_FLAGS
> +#include <iconv/loop.c>
> +#include <iconv/skeleton.c>
> diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
> index b3d8bf5110..a24d8d2207 100644
> --- a/iconv/tst-iconv_prog.sh
> +++ b/iconv/tst-iconv_prog.sh
> @@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do
>    execute_test
>    check_errtest_result
>  done
> +
> +allbytes ()
> +{
> +  for (( i = 0; i <= 255; i++ )); do
> +    printf '\'"$(printf "%o" "$i")"
> +  done
> +}
> +
> +allucs4be ()
> +{
> +  for (( i = 0; i <= 127; i++ )); do
> +    printf '\0\0\0\'"$(printf "%o" "$i")"
> +  done
> +  for (( i = 128; i <= 255; i++ )); do
> +    printf '\0\0\xdf\'"$(printf "%o" "$i")"
> +  done
> +}
> +
> +check_posix_result ()
> +{
> +  if [ $? -eq 0 ]; then
> +    result=PASS
> +  else
> +    result=FAIL
> +  fi
> +
> +  echo "$result: from \"$1\", to: \"$2\""
> +
> +  if [ "$result" != "PASS" ]; then
> +    exit 1
> +  fi
> +}
> +
> +check_posix_encoding ()
> +{
> +  eval PROG=\"$ICONV\"
> +  allbytes  | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be)
> +  check_posix_result POSIX UCS-4BE
> +  allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes)
> +  check_posix_result UCS-4BE POSIX
> +}
> +
> +check_posix_encoding
> diff --git a/iconvdata/tst-tables.sh b/iconvdata/tst-tables.sh
> index 4207b44175..33a02158ac 100755
> --- a/iconvdata/tst-tables.sh
> +++ b/iconvdata/tst-tables.sh
> @@ -31,6 +31,7 @@ cat <<EOF |
>    # Keep this list in the same order as gconv-modules.
>    #
>    # charset name    table name          comment
> +  POSIX
>    ASCII             ANSI_X3.4-1968
>    ISO646-GB         BS_4730
>    ISO646-CA         CSA_Z243.4-1985-1
> diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c
> index bfd34eee31..b379481844 100644
> --- a/inet/tst-idna_name_classify.c
> +++ b/inet/tst-idna_name_classify.c
> @@ -37,11 +37,11 @@ do_test (void)
>    puts ("info: C locale tests");
>    locale_insensitive_tests ();
>    TEST_COMPARE (__idna_name_classify ("abc\200def"),
> -                idna_name_encoding_error);
> +                idna_name_nonascii);
>    TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
> -                idna_name_encoding_error);
> +                idna_name_nonascii_backslash);
>    TEST_COMPARE (__idna_name_classify ("abc\377def"),
> -                idna_name_encoding_error);
> +                idna_name_nonascii);
>  
>    puts ("info: en_US.ISO-8859-1 locale tests");
>    if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)

This seems to be okay, there is further test coverage for
idna_name_encoding_error.

> diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
> index 6bd0367069..f30396ae12 100644
> --- a/locale/tst-C-locale.c
> +++ b/locale/tst-C-locale.c
> @@ -229,6 +229,75 @@ run_test (const char *locname)
>    STRTEST (YESSTR, "");
>    STRTEST (NOSTR, "");
>  
> +#define CONVTEST(b, v) \
> +  {									      \
> +    unsigned char bs[] = {b, 0};					      \
> +    mbstate_t ctx = {};							      \
> +    wchar_t wc = -1;							      \
> +    size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);			      \

Missing space before '(' (also in other cases below).

Not sure if the macros are needed, maybe write one loop for each
direction with a condition in it?

> diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
> index 44844e71c3..e66242b58f 100644
> --- a/stdio-common/tst-printf-bz25691.c
> +++ b/stdio-common/tst-printf-bz25691.c
> @@ -30,6 +30,8 @@
>  static int
>  do_test (void)
>  {
> +  setlocale(LC_CTYPE, "C.UTF-8");
> +
>    mtrace ();
>  
>    /* For 's' conversion specifier with 'l' modifier the array must be

What's the rationale for this change?  If it is really required, you
must also update stdio-common/Makefile with a new dependency on
$(gen-locales).

Thanks,
Florian


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v7] POSIX locale covers every byte [BZ# 29511]
  2022-11-09 14:20             ` Florian Weimer
@ 2022-11-09 16:14               ` наб
  2022-11-10  9:52                 ` Florian Weimer
  2022-11-10  8:10               ` [PATCH v6 2/2] " Florian Weimer
  1 sibling, 1 reply; 29+ messages in thread
From: наб @ 2022-11-09 16:14 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha, Victor Stinner

[-- Attachment #1: Type: text/plain, Size: 29411 bytes --]

Hi!

On Wed, Nov 09, 2022 at 03:20:26PM +0100, Florian Weimer wrote:
> * наб:
> > This is a logistically trivial patch,
> > largely duplicating the extant ASCII code with the error path changed
> I wouldn't say it's trivial in the commit message. 8-)
Trimmed to ~second line

> > There are two user-facing changes:
> >   * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
> >   * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b
> >
> > Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
> >   (a) is 1-byte, stateless, and contains 256 characters
> >   (b) they collate in byte order
> >   (c) the first 128 characters are equivalent to ASCII (like previous)
> > cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
> > changes to the standard;
> > in short, this means that mbrtowc() must never fail and must return
> >   b if b <= 0x7F else ab+c for all bytes b
> >   where c is some constant >=0x80
> >     and a is a positive integer constant
> >
> > By strategically picking c=<UDF00> we land at the tail-end of the
> > Unicode Low Surrogate Area at DC00-DFFF, described as
> >   > Isolated surrogate code points have no interpretation;
> >   > consequently, no character code charts or names lists
> >   > are provided for this range.
> > and match musl
> 
> Sadly this doesn't match Python and PEP 540:
> 
> >>> b'\x80'.decode('UTF-8', errors='surrogateescape')
> '\udc80'
> 
> I believe the implementation translates this to 0xDF80 instead.
Yes.

> Not sure what is more important here, musl compatibility or Python
> compatibility.  Cc:ing Victor in case he as comments.  I should probably
> ask on the musl list as well as how this divergence came to pass.
I went for musl because (a) it's a libc not some random programming
language, (b) putting the end of our domain at the end of the
surrogates is more aesthetically and ideologically pleasing, and (c)
there's marginal value of having both musl and glibc produce the same
characters if you like save them as integers for some reason.
But the choice of any range therein is pretty much editorial, I think.

> This change definitely needs a NEWS entry.
Something like this?
  Deprecated and removed features, and other changes affecting compatibility:
  * The default/"POSIX"/"C" locale's character set is now "POSIX",
    instead of "ANSI_X3.4-1968"  this is a new fully-reversible
    8-bit transparent encoding for compatibility with Issue 7 TC 2,
    identity-mapping bytes in the ASCII [0, 0x7F] range,
    and mapping [0x80, 0xFF] bytes to [<U+DF80>, <U+DFFF>].

> > diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
> > index 1c6745043e..45ab1edfad 100644
> > --- a/iconv/gconv_int.h
> > +++ b/iconv/gconv_int.h
> > @@ -299,6 +301,11 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
> >     only ASCII characters.  */
> >  extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
> >  
> > +/* Specialized conversion function for a single byte to INTERNAL,
> > +   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
> > +   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
> > +extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c);
> > +
> >  #endif
> 
> Missing attribute_hidden.  Yes, it's also missing from
> __gconv_btwoc_ascii.  The linker probably papers over it.
Added.

> > diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c
> > new file mode 100644
> > index 0000000000..dcb13fbb43
> > --- /dev/null
> > +++ b/iconv/gconv_posix.c
> > @@ -0,0 +1,96 @@
> > +/* Simple transformations functions.
> 
> I think this line should say something about surrogate-escape encoding
> for the POSIX locale.
I completely missed that this line isn't part of the licence. Used
> "POSIX" locale transformation functions.
as the shorthand, this is expounded in the comment for __g_b_p() below.

> > +    else								      \
> > +      {									      \
> > +	if (__glibc_unlikely (val > 0x7f))				      \
> > +	  val -= 0xdf00;						      \
> > +	*outptr++ = val;						      \
> > +	inptr += sizeof (uint32_t);					      \
> > +      }									      \
> > +  }
> 
> I suggest to drop the last __glibc_unlikely here because it's
> input-dependent.
Applied.

> > diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
> > index 6bd0367069..f30396ae12 100644
> > --- a/locale/tst-C-locale.c
> > +++ b/locale/tst-C-locale.c
> > @@ -229,6 +229,75 @@ run_test (const char *locname)
> >    STRTEST (YESSTR, "");
> >    STRTEST (NOSTR, "");
> >  
> > +#define CONVTEST(b, v) \
> > +  {									      \
> > +    unsigned char bs[] = {b, 0};					      \
> > +    mbstate_t ctx = {};							      \
> > +    wchar_t wc = -1;							      \
> > +    size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);			      \
> 
> Missing space before '(' (also in other cases below).
> 
> Not sure if the macros are needed, maybe write one loop for each
> direction with a condition in it?
Fixed, unrolled.

> > diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
> > index 44844e71c3..e66242b58f 100644
> > --- a/stdio-common/tst-printf-bz25691.c
> > +++ b/stdio-common/tst-printf-bz25691.c
> > @@ -30,6 +30,8 @@
> >  static int
> >  do_test (void)
> >  {
> > +  setlocale(LC_CTYPE, "C.UTF-8");
> > +
> >    mtrace ();
> >  
> >    /* For 's' conversion specifier with 'l' modifier the array must be
> 
> What's the rationale for this change?  If it is really required, you
> must also update stdio-common/Makefile with a new dependency on
> $(gen-locales).
The test depends on the locale having a hole at 0xFF, cf. ll. 93-100:
    /* Same test, but with an invalid multibyte sequence.  */
    mbs[mbssize - 2] = 0xff;

    ret = swprintf (result, resultsize, L"%.65537s", mbs);
    TEST_COMPARE (ret, -1);

    ret = swprintf (result, resultsize, L"%1$.65537s", mbs);
    TEST_COMPARE (ret, -1);
And this is the simplest way to ensure that, I think.

Dependency added.

> Thanks,
> Florian
Best,
наб

Scissor-patch follows.
-- >8 --
This largely duplicates the ASCII code with the error path changed

There are two user-facing changes:
  * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
  * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b

Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
  (a) is 1-byte, stateless, and contains 256 characters
  (b) they collate in byte order
  (c) the first 128 characters are equivalent to ASCII (like previous)
cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
changes to the standard;
in short, this means that mbrtowc() must never fail and must return
  b if b <= 0x7F else ab+c for all bytes b
  where c is some constant >=0x80
    and a is a positive integer constant

By strategically picking c=<UDF00> we land at the tail-end of the
Unicode Low Surrogate Area at DC00-DFFF, described as
  > Isolated surrogate code points have no interpretation;
  > consequently, no character code charts or names lists
  > are provided for this range.
and match musl

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
 iconv/Makefile                    |   2 +-
 iconv/gconv_builtin.h             |   8 ++
 iconv/gconv_int.h                 |   8 ++
 iconv/gconv_posix.c               |  96 ++++++++++++++++++++
 iconv/tst-iconv_prog.sh           |  43 +++++++++
 iconvdata/tst-tables.sh           |   1 +
 inet/tst-idna_name_classify.c     |   6 +-
 locale/tst-C-locale.c             |  44 +++++++++
 localedata/charmaps/POSIX         | 136 ++++++++++++++++++++++++++++
 localedata/locales/POSIX          | 143 +++++++++++++++++++++++++++++-
 stdio-common/Makefile             |   1 +
 stdio-common/tst-printf-bz25691.c |   2 +
 wcsmbs/wcsmbsload.c               |  14 +--
 13 files changed, 493 insertions(+), 11 deletions(-)
 create mode 100644 iconv/gconv_posix.c
 create mode 100644 localedata/charmaps/POSIX

diff --git a/iconv/Makefile b/iconv/Makefile
index a0d90cfeac..6e926f53e3 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -25,7 +25,7 @@ include ../Makeconfig
 headers		= iconv.h gconv.h
 routines	= iconv_open iconv iconv_close \
 		  gconv_open gconv gconv_close gconv_db gconv_conf \
-		  gconv_builtin gconv_simple gconv_trans gconv_cache
+		  gconv_builtin gconv_simple gconv_posix gconv_trans gconv_cache
 routines	+= gconv_dl gconv_charset
 
 vpath %.c ../locale/programs ../intl
diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h
index 68c2369b1f..cd1805b3ce 100644
--- a/iconv/gconv_builtin.h
+++ b/iconv/gconv_builtin.h
@@ -89,6 +89,14 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
 			__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
 
 
+BUILTIN_TRANSFORMATION ("POSIX//", "INTERNAL", 1, "=posix->INTERNAL",
+			__gconv_transform_posix_internal, __gconv_btwoc_posix,
+			1, 1, 4, 4)
+
+BUILTIN_TRANSFORMATION ("INTERNAL", "POSIX//", 1, "=INTERNAL->posix",
+			__gconv_transform_internal_posix, NULL, 4, 4, 1, 1)
+
+
 #if BYTE_ORDER == BIG_ENDIAN
 BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
 BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index 1c6745043e..2f21a7389d 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -281,6 +281,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2)
 
 __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
+__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
 __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
 __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
@@ -299,6 +301,12 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
    only ASCII characters.  */
 extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
 
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+     attribute_hidden;
+
 #endif
 
 __END_DECLS
diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c
new file mode 100644
index 0000000000..412d910109
--- /dev/null
+++ b/iconv/gconv_posix.c
@@ -0,0 +1,96 @@
+/* "POSIX" locale transformation functions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <gconv_int.h>
+
+
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+wint_t
+__gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+{
+  if (c < 0x80)
+    return c;
+  else
+    return 0xdf00 + c;
+}
+
+
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DF80, U+DFFF]}
+   to the internal (UCS4-like) format.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		1
+#define MIN_NEEDED_TO		4
+#define FROM_DIRECTION		1
+#define FROM_LOOP		posix_internal_loop
+#define TO_LOOP			posix_internal_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_posix_internal
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdf00 + *inptr++;			      \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;				      \
+    outptr += sizeof (uint32_t);					      \
+  }
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DF80, U+DFFF] => [0x80, 0xFF]}.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		4
+#define MIN_NEEDED_TO		1
+#define FROM_DIRECTION		1
+#define FROM_LOOP		internal_posix_loop
+#define TO_LOOP			internal_posix_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_internal_posix
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    uint32_t val = *((const uint32_t *) inptr);				      \
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdf80) || val > 0xdfff))      \
+      {									      \
+	UNICODE_TAG_HANDLER (val, 4);					      \
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
+      }									      \
+    else								      \
+      {									      \
+	if (val > 0x7f)							      \
+	  val -= 0xdf00;						      \
+	*outptr++ = val;						      \
+	inptr += sizeof (uint32_t);					      \
+      }									      \
+  }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
index b3d8bf5110..a24d8d2207 100644
--- a/iconv/tst-iconv_prog.sh
+++ b/iconv/tst-iconv_prog.sh
@@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do
   execute_test
   check_errtest_result
 done
+
+allbytes ()
+{
+  for (( i = 0; i <= 255; i++ )); do
+    printf '\'"$(printf "%o" "$i")"
+  done
+}
+
+allucs4be ()
+{
+  for (( i = 0; i <= 127; i++ )); do
+    printf '\0\0\0\'"$(printf "%o" "$i")"
+  done
+  for (( i = 128; i <= 255; i++ )); do
+    printf '\0\0\xdf\'"$(printf "%o" "$i")"
+  done
+}
+
+check_posix_result ()
+{
+  if [ $? -eq 0 ]; then
+    result=PASS
+  else
+    result=FAIL
+  fi
+
+  echo "$result: from \"$1\", to: \"$2\""
+
+  if [ "$result" != "PASS" ]; then
+    exit 1
+  fi
+}
+
+check_posix_encoding ()
+{
+  eval PROG=\"$ICONV\"
+  allbytes  | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be)
+  check_posix_result POSIX UCS-4BE
+  allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes)
+  check_posix_result UCS-4BE POSIX
+}
+
+check_posix_encoding
diff --git a/iconvdata/tst-tables.sh b/iconvdata/tst-tables.sh
index 4207b44175..33a02158ac 100755
--- a/iconvdata/tst-tables.sh
+++ b/iconvdata/tst-tables.sh
@@ -31,6 +31,7 @@ cat <<EOF |
   # Keep this list in the same order as gconv-modules.
   #
   # charset name    table name          comment
+  POSIX
   ASCII             ANSI_X3.4-1968
   ISO646-GB         BS_4730
   ISO646-CA         CSA_Z243.4-1985-1
diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c
index bfd34eee31..b379481844 100644
--- a/inet/tst-idna_name_classify.c
+++ b/inet/tst-idna_name_classify.c
@@ -37,11 +37,11 @@ do_test (void)
   puts ("info: C locale tests");
   locale_insensitive_tests ();
   TEST_COMPARE (__idna_name_classify ("abc\200def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
   TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
-                idna_name_encoding_error);
+                idna_name_nonascii_backslash);
   TEST_COMPARE (__idna_name_classify ("abc\377def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
 
   puts ("info: en_US.ISO-8859-1 locale tests");
   if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)
diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
index 6bd0367069..0f03d69003 100644
--- a/locale/tst-C-locale.c
+++ b/locale/tst-C-locale.c
@@ -20,6 +20,7 @@
 #include <langinfo.h>
 #include <limits.h>
 #include <locale.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <string.h>
 #include <wchar.h>
@@ -229,6 +230,49 @@ run_test (const char *locname)
   STRTEST (YESSTR, "");
   STRTEST (NOSTR, "");
 
+  for(int i = 0; i <= 0xff; ++i)
+    {
+      unsigned char bs[] = {i, 0};
+      mbstate_t ctx = {};
+      wchar_t wc = -1, exp = i <= 0x7f ? i : (0xdf00 + i);
+      size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);
+      if (sz != !!i)
+	{
+	  printf ("mbrtowc(%02hhx) width in locale %s wrong "
+		  "(is %zd, should be %d)\n", *bs, locname, sz, !!i);
+	  result = 1;
+	}
+      if (wc != exp)
+	{
+	  printf ("mbrtowc(%02hhx) value in locale %s wrong "
+		  "(is %x, should be %x)\n", *bs, locname, wc, exp);
+	  result = 1;
+	}
+    }
+
+  for (int i = 0; i <= 0xffff; ++i)
+    {
+      bool expok = (i <= 0x7f) || (i >= 0xdf80 && i <= 0xdfff);
+      size_t expsz = expok ? 1 : (size_t) -1;
+      unsigned char expob = expok ? (i & 0xff) : (unsigned char) -1;
+
+      unsigned char ob = -1;
+      mbstate_t ctx = {};
+      size_t sz = wcrtomb ((char *) &ob, i, &ctx);
+      if (sz != expsz)
+	{
+	  printf ("wcrtomb(%x) width in locale %s wrong "
+		  "(is %zd, should be %zd)\n", i, locname, sz, expsz);
+	  result = 1;
+	}
+      if (ob != expob)
+	{
+	  printf ("wcrtomb(%x) value in locale %s wrong "
+		  "(is %hhx, should be %hhx)\n", i, locname, ob, expob);
+	  result = 1;
+	}
+    }
+
   /* Test the new locale mechanisms.  */
   loc = newlocale (LC_ALL_MASK, locname, NULL);
   if (loc == NULL)
diff --git a/localedata/charmaps/POSIX b/localedata/charmaps/POSIX
new file mode 100644
index 0000000000..c44007ff49
--- /dev/null
+++ b/localedata/charmaps/POSIX
@@ -0,0 +1,136 @@
+<code_set_name> POSIX
+<comment_char> %
+<escape_char> /
+% source: cf. localedata/locales/POSIX, LC_COLLATE
+
+CHARMAP
+<U0000>     /x00         NULL (NUL)
+<U0001>     /x01         START OF HEADING (SOH)
+<U0002>     /x02         START OF TEXT (STX)
+<U0003>     /x03         END OF TEXT (ETX)
+<U0004>     /x04         END OF TRANSMISSION (EOT)
+<U0005>     /x05         ENQUIRY (ENQ)
+<U0006>     /x06         ACKNOWLEDGE (ACK)
+<U0007>     /x07         BELL (BEL)
+<U0008>     /x08         BACKSPACE (BS)
+<U0009>     /x09         CHARACTER TABULATION (HT)
+<U000A>     /x0a         LINE FEED (LF)
+<U000B>     /x0b         LINE TABULATION (VT)
+<U000C>     /x0c         FORM FEED (FF)
+<U000D>     /x0d         CARRIAGE RETURN (CR)
+<U000E>     /x0e         SHIFT OUT (SO)
+<U000F>     /x0f         SHIFT IN (SI)
+<U0010>     /x10         DATALINK ESCAPE (DLE)
+<U0011>     /x11         DEVICE CONTROL ONE (DC1)
+<U0012>     /x12         DEVICE CONTROL TWO (DC2)
+<U0013>     /x13         DEVICE CONTROL THREE (DC3)
+<U0014>     /x14         DEVICE CONTROL FOUR (DC4)
+<U0015>     /x15         NEGATIVE ACKNOWLEDGE (NAK)
+<U0016>     /x16         SYNCHRONOUS IDLE (SYN)
+<U0017>     /x17         END OF TRANSMISSION BLOCK (ETB)
+<U0018>     /x18         CANCEL (CAN)
+<U0019>     /x19         END OF MEDIUM (EM)
+<U001A>     /x1a         SUBSTITUTE (SUB)
+<U001B>     /x1b         ESCAPE (ESC)
+<U001C>     /x1c         FILE SEPARATOR (IS4)
+<U001D>     /x1d         GROUP SEPARATOR (IS3)
+<U001E>     /x1e         RECORD SEPARATOR (IS2)
+<U001F>     /x1f         UNIT SEPARATOR (IS1)
+<U0020>     /x20         SPACE
+<U0021>     /x21         EXCLAMATION MARK
+<U0022>     /x22         QUOTATION MARK
+<U0023>     /x23         NUMBER SIGN
+<U0024>     /x24         DOLLAR SIGN
+<U0025>     /x25         PERCENT SIGN
+<U0026>     /x26         AMPERSAND
+<U0027>     /x27         APOSTROPHE
+<U0028>     /x28         LEFT PARENTHESIS
+<U0029>     /x29         RIGHT PARENTHESIS
+<U002A>     /x2a         ASTERISK
+<U002B>     /x2b         PLUS SIGN
+<U002C>     /x2c         COMMA
+<U002D>     /x2d         HYPHEN-MINUS
+<U002E>     /x2e         FULL STOP
+<U002F>     /x2f         SOLIDUS
+<U0030>     /x30         DIGIT ZERO
+<U0031>     /x31         DIGIT ONE
+<U0032>     /x32         DIGIT TWO
+<U0033>     /x33         DIGIT THREE
+<U0034>     /x34         DIGIT FOUR
+<U0035>     /x35         DIGIT FIVE
+<U0036>     /x36         DIGIT SIX
+<U0037>     /x37         DIGIT SEVEN
+<U0038>     /x38         DIGIT EIGHT
+<U0039>     /x39         DIGIT NINE
+<U003A>     /x3a         COLON
+<U003B>     /x3b         SEMICOLON
+<U003C>     /x3c         LESS-THAN SIGN
+<U003D>     /x3d         EQUALS SIGN
+<U003E>     /x3e         GREATER-THAN SIGN
+<U003F>     /x3f         QUESTION MARK
+<U0040>     /x40         COMMERCIAL AT
+<U0041>     /x41         LATIN CAPITAL LETTER A
+<U0042>     /x42         LATIN CAPITAL LETTER B
+<U0043>     /x43         LATIN CAPITAL LETTER C
+<U0044>     /x44         LATIN CAPITAL LETTER D
+<U0045>     /x45         LATIN CAPITAL LETTER E
+<U0046>     /x46         LATIN CAPITAL LETTER F
+<U0047>     /x47         LATIN CAPITAL LETTER G
+<U0048>     /x48         LATIN CAPITAL LETTER H
+<U0049>     /x49         LATIN CAPITAL LETTER I
+<U004A>     /x4a         LATIN CAPITAL LETTER J
+<U004B>     /x4b         LATIN CAPITAL LETTER K
+<U004C>     /x4c         LATIN CAPITAL LETTER L
+<U004D>     /x4d         LATIN CAPITAL LETTER M
+<U004E>     /x4e         LATIN CAPITAL LETTER N
+<U004F>     /x4f         LATIN CAPITAL LETTER O
+<U0050>     /x50         LATIN CAPITAL LETTER P
+<U0051>     /x51         LATIN CAPITAL LETTER Q
+<U0052>     /x52         LATIN CAPITAL LETTER R
+<U0053>     /x53         LATIN CAPITAL LETTER S
+<U0054>     /x54         LATIN CAPITAL LETTER T
+<U0055>     /x55         LATIN CAPITAL LETTER U
+<U0056>     /x56         LATIN CAPITAL LETTER V
+<U0057>     /x57         LATIN CAPITAL LETTER W
+<U0058>     /x58         LATIN CAPITAL LETTER X
+<U0059>     /x59         LATIN CAPITAL LETTER Y
+<U005A>     /x5a         LATIN CAPITAL LETTER Z
+<U005B>     /x5b         LEFT SQUARE BRACKET
+<U005C>     /x5c         REVERSE SOLIDUS
+<U005D>     /x5d         RIGHT SQUARE BRACKET
+<U005E>     /x5e         CIRCUMFLEX ACCENT
+<U005F>     /x5f         LOW LINE
+<U0060>     /x60         GRAVE ACCENT
+<U0061>     /x61         LATIN SMALL LETTER A
+<U0062>     /x62         LATIN SMALL LETTER B
+<U0063>     /x63         LATIN SMALL LETTER C
+<U0064>     /x64         LATIN SMALL LETTER D
+<U0065>     /x65         LATIN SMALL LETTER E
+<U0066>     /x66         LATIN SMALL LETTER F
+<U0067>     /x67         LATIN SMALL LETTER G
+<U0068>     /x68         LATIN SMALL LETTER H
+<U0069>     /x69         LATIN SMALL LETTER I
+<U006A>     /x6a         LATIN SMALL LETTER J
+<U006B>     /x6b         LATIN SMALL LETTER K
+<U006C>     /x6c         LATIN SMALL LETTER L
+<U006D>     /x6d         LATIN SMALL LETTER M
+<U006E>     /x6e         LATIN SMALL LETTER N
+<U006F>     /x6f         LATIN SMALL LETTER O
+<U0070>     /x70         LATIN SMALL LETTER P
+<U0071>     /x71         LATIN SMALL LETTER Q
+<U0072>     /x72         LATIN SMALL LETTER R
+<U0073>     /x73         LATIN SMALL LETTER S
+<U0074>     /x74         LATIN SMALL LETTER T
+<U0075>     /x75         LATIN SMALL LETTER U
+<U0076>     /x76         LATIN SMALL LETTER V
+<U0077>     /x77         LATIN SMALL LETTER W
+<U0078>     /x78         LATIN SMALL LETTER X
+<U0079>     /x79         LATIN SMALL LETTER Y
+<U007A>     /x7a         LATIN SMALL LETTER Z
+<U007B>     /x7b         LEFT CURLY BRACKET
+<U007C>     /x7c         VERTICAL LINE
+<U007D>     /x7d         RIGHT CURLY BRACKET
+<U007E>     /x7e         TILDE
+<U007F>     /x7f         DELETE (DEL)
+<UDF80>..<UDFFF> /x80
+END CHARMAP
diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
index 7ec7f1c577..fc34a6abc1 100644
--- a/localedata/locales/POSIX
+++ b/localedata/locales/POSIX
@@ -97,6 +97,20 @@ END LC_CTYPE
 LC_COLLATE
 % This is the POSIX Locale definition for the LC_COLLATE category.
 % The order is the same as in the ASCII code set.
+% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2,
+% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale:
+% > All characters not explicitly listed here shall be inserted
+% > in the character collation order after the listed characters
+% > and shall be assigned unique primary weights. If the listed
+% > characters have ASCII encoding, the other characters shall
+% > be in ascending order according to their coded character set values
+% Since Issue 7 TC2 (XBD, 6.2 Character Encoding):
+% > The POSIX locale shall contain 256 single-byte characters [...]
+% (cf. bug 663, 674).
+% this is in contrast to previous issues, which limited the POSIX
+% locale to the Portable Character Set (7-bit ASCII).
+% We use the end of the Low Surrogate Area to contain these,
+% yielding [<UDF80>, <UDFFF>]
 order_start forward
 <U0000>
 <U0001>
@@ -226,7 +240,134 @@ order_start forward
 <U007D>
 <U007E>
 <U007F>
-UNDEFINED
+<UDF80>
+<UDF81>
+<UDF82>
+<UDF83>
+<UDF84>
+<UDF85>
+<UDF86>
+<UDF87>
+<UDF88>
+<UDF89>
+<UDF8A>
+<UDF8B>
+<UDF8C>
+<UDF8D>
+<UDF8E>
+<UDF8F>
+<UDF90>
+<UDF91>
+<UDF92>
+<UDF93>
+<UDF94>
+<UDF95>
+<UDF96>
+<UDF97>
+<UDF98>
+<UDF99>
+<UDF9A>
+<UDF9B>
+<UDF9C>
+<UDF9D>
+<UDF9E>
+<UDF9F>
+<UDFA0>
+<UDFA1>
+<UDFA2>
+<UDFA3>
+<UDFA4>
+<UDFA5>
+<UDFA6>
+<UDFA7>
+<UDFA8>
+<UDFA9>
+<UDFAA>
+<UDFAB>
+<UDFAC>
+<UDFAD>
+<UDFAE>
+<UDFAF>
+<UDFB0>
+<UDFB1>
+<UDFB2>
+<UDFB3>
+<UDFB4>
+<UDFB5>
+<UDFB6>
+<UDFB7>
+<UDFB8>
+<UDFB9>
+<UDFBA>
+<UDFBB>
+<UDFBC>
+<UDFBD>
+<UDFBE>
+<UDFBF>
+<UDFC0>
+<UDFC1>
+<UDFC2>
+<UDFC3>
+<UDFC4>
+<UDFC5>
+<UDFC6>
+<UDFC7>
+<UDFC8>
+<UDFC9>
+<UDFCA>
+<UDFCB>
+<UDFCC>
+<UDFCD>
+<UDFCE>
+<UDFCF>
+<UDFD0>
+<UDFD1>
+<UDFD2>
+<UDFD3>
+<UDFD4>
+<UDFD5>
+<UDFD6>
+<UDFD7>
+<UDFD8>
+<UDFD9>
+<UDFDA>
+<UDFDB>
+<UDFDC>
+<UDFDD>
+<UDFDE>
+<UDFDF>
+<UDFE0>
+<UDFE1>
+<UDFE2>
+<UDFE3>
+<UDFE4>
+<UDFE5>
+<UDFE6>
+<UDFE7>
+<UDFE8>
+<UDFE9>
+<UDFEA>
+<UDFEB>
+<UDFEC>
+<UDFED>
+<UDFEE>
+<UDFEF>
+<UDFF0>
+<UDFF1>
+<UDFF2>
+<UDFF3>
+<UDFF4>
+<UDFF5>
+<UDFF6>
+<UDFF7>
+<UDFF8>
+<UDFF9>
+<UDFFA>
+<UDFFB>
+<UDFFC>
+<UDFFD>
+<UDFFE>
+<UDFFF>
 order_end
 %
 END LC_COLLATE
diff --git a/stdio-common/Makefile b/stdio-common/Makefile
index 401dac69de..3cd6c3dc56 100644
--- a/stdio-common/Makefile
+++ b/stdio-common/Makefile
@@ -305,6 +305,7 @@ $(objpfx)scanf13.out: $(gen-locales)
 $(objpfx)test-vfprintf.out: $(gen-locales)
 $(objpfx)tst-grouping.out: $(gen-locales)
 $(objpfx)tst-grouping2.out: $(gen-locales)
+$(objpfx)tst-printf-bz25691-mem.out: $(gen-locales)
 $(objpfx)tst-sprintf.out: $(gen-locales)
 $(objpfx)tst-sscanf.out: $(gen-locales)
 $(objpfx)tst-swprintf.out: $(gen-locales)
diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
index 44844e71c3..e66242b58f 100644
--- a/stdio-common/tst-printf-bz25691.c
+++ b/stdio-common/tst-printf-bz25691.c
@@ -30,6 +30,8 @@
 static int
 do_test (void)
 {
+  setlocale(LC_CTYPE, "C.UTF-8");
+
   mtrace ();
 
   /* For 's' conversion specifier with 'l' modifier the array must be
diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
index 0f0f55f9ed..97de9afd25 100644
--- a/wcsmbs/wcsmbsload.c
+++ b/wcsmbs/wcsmbsload.c
@@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
   .__shlib_handle = NULL,
   .__modname = NULL,
   .__counter = INT_MAX,
-  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
+  .__from_name = (char *) "POSIX",
   .__to_name = (char *) "INTERNAL",
-  .__fct = __gconv_transform_ascii_internal,
-  .__btowc_fct = __gconv_btwoc_ascii,
+  .__fct = __gconv_transform_posix_internal,
+  .__btowc_fct = __gconv_btwoc_posix,
   .__init_fct = NULL,
   .__end_fct = NULL,
   .__min_needed_from = 1,
@@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
   .__modname = NULL,
   .__counter = INT_MAX,
   .__from_name = (char *) "INTERNAL",
-  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
-  .__fct = __gconv_transform_internal_ascii,
+  .__to_name = (char *) "POSIX",
+  .__fct = __gconv_transform_internal_posix,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
@@ -67,7 +67,9 @@ static const struct __gconv_step to_mb =
 };
 
 
-/* For the default locale we only have to handle ANSI_X3.4-1968.  */
+/* The default/"POSIX"/"C" locale is an 8-bit-clean mapping
+   with ANSI_X3.4-1968 in the first 128 characters;
+   we lift the remaining bytes by <UDF00>.  */
 const struct gconv_fcts __wcsmbs_gconv_fcts_c =
 {
   .towc = (struct __gconv_step *) &to_wc,
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v6 2/2] POSIX locale covers every byte [BZ# 29511]
  2022-11-09 14:20             ` Florian Weimer
  2022-11-09 16:14               ` [PATCH v7] " наб
@ 2022-11-10  8:10               ` Florian Weimer
  2022-11-28 16:24                 ` наб
  1 sibling, 1 reply; 29+ messages in thread
From: Florian Weimer @ 2022-11-10  8:10 UTC (permalink / raw)
  To: наб; +Cc: libc-alpha, Victor Stinner

* Florian Weimer:

> * наб:
>
>> This is a logistically trivial patch,
>> largely duplicating the extant ASCII code with the error path changed
>
> I wouldn't say it's trivial in the commit message. 8-)
>
>> There are two user-facing changes:
>>   * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
>>   * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b
>>
>> Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
>>   (a) is 1-byte, stateless, and contains 256 characters
>>   (b) they collate in byte order
>>   (c) the first 128 characters are equivalent to ASCII (like previous)
>> cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
>> changes to the standard;
>> in short, this means that mbrtowc() must never fail and must return
>>   b if b <= 0x7F else ab+c for all bytes b
>>   where c is some constant >=0x80
>>     and a is a positive integer constant
>>
>> By strategically picking c=<UDF00> we land at the tail-end of the
>> Unicode Low Surrogate Area at DC00-DFFF, described as
>>   > Isolated surrogate code points have no interpretation;
>>   > consequently, no character code charts or names lists
>>   > are provided for this range.
>> and match musl
>
> Sadly this doesn't match Python and PEP 540:
>
>>>> b'\x80'.decode('UTF-8', errors='surrogateescape')
> '\udc80'
>
> I believe the implementation translates this to 0xDF80 instead.
>
> Not sure what is more important here, musl compatibility or Python
> compatibility.  Cc:ing Victor in case he as comments.  I should probably
> ask on the musl list as well as how this divergence came to pass.

Raised on the musl list here:

  Choice of wchar_t mapping for non-ASCII bytes in the POSIX locale
  <https://www.openwall.com/lists/musl/2022/11/10/1>

> This change definitely needs a NEWS entry.

(With this I meant the change overall, not the encoding.)

Thanks,
Florian


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v7] POSIX locale covers every byte [BZ# 29511]
  2022-11-09 16:14               ` [PATCH v7] " наб
@ 2022-11-10  9:52                 ` Florian Weimer
  2023-01-09 15:17                   ` [PATCH v8] " наб
  0 siblings, 1 reply; 29+ messages in thread
From: Florian Weimer @ 2022-11-10  9:52 UTC (permalink / raw)
  To: наб; +Cc: libc-alpha, Victor Stinner

* наб:

>> Not sure what is more important here, musl compatibility or Python
>> compatibility.  Cc:ing Victor in case he as comments.  I should probably
>> ask on the musl list as well as how this divergence came to pass.

> I went for musl because (a) it's a libc not some random programming
> language, (b) putting the end of our domain at the end of the
> surrogates is more aesthetically and ideologically pleasing, and (c)
> there's marginal value of having both musl and glibc produce the same
> characters if you like save them as integers for some reason.
> But the choice of any range therein is pretty much editorial, I think.

Let's wait and see what the musl folks say.

>> This change definitely needs a NEWS entry.
> Something like this?
>   Deprecated and removed features, and other changes affecting compatibility:
>   * The default/"POSIX"/"C" locale's character set is now "POSIX",
>     instead of "ANSI_X3.4-1968"  this is a new fully-reversible
>     8-bit transparent encoding for compatibility with Issue 7 TC 2,

“POSIX Issue 7 TC 2”

>     identity-mapping bytes in the ASCII [0, 0x7F] range,
>     and mapping [0x80, 0xFF] bytes to [<U+DF80>, <U+DFFF>].

It should go into the major new features section, I think.

I would also say that POSIX no longer allows using UTF-8 for the C/POSIX
locale because the obvious question will be “why this custom encoding
and not UTF-8?”.  This new POSIX requirement is still a major
disappointment to me.

No need to repost for now.

>> > diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
>> > index 44844e71c3..e66242b58f 100644
>> > --- a/stdio-common/tst-printf-bz25691.c
>> > +++ b/stdio-common/tst-printf-bz25691.c
>> > @@ -30,6 +30,8 @@
>> >  static int
>> >  do_test (void)
>> >  {
>> > +  setlocale(LC_CTYPE, "C.UTF-8");
>> > +
>> >    mtrace ();
>> >  
>> >    /* For 's' conversion specifier with 'l' modifier the array must be
>> 
>> What's the rationale for this change?  If it is really required, you
>> must also update stdio-common/Makefile with a new dependency on
>> $(gen-locales).
> The test depends on the locale having a hole at 0xFF, cf. ll. 93-100:
>     /* Same test, but with an invalid multibyte sequence.  */
>     mbs[mbssize - 2] = 0xff;
>
>     ret = swprintf (result, resultsize, L"%.65537s", mbs);
>     TEST_COMPARE (ret, -1);
>
>     ret = swprintf (result, resultsize, L"%1$.65537s", mbs);
>     TEST_COMPARE (ret, -1);
> And this is the simplest way to ensure that, I think.
>
> Dependency added.

Right, makes sense.

Thanks,
Florian


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v6 2/2] POSIX locale covers every byte [BZ# 29511]
  2022-11-10  8:10               ` [PATCH v6 2/2] " Florian Weimer
@ 2022-11-28 16:24                 ` наб
  2022-12-02 17:36                   ` Florian Weimer
  0 siblings, 1 reply; 29+ messages in thread
From: наб @ 2022-11-28 16:24 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha, Victor Stinner

[-- Attachment #1: Type: text/plain, Size: 478 bytes --]

18-day bump :)

On Thu, Nov 10, 2022 at 09:10:57AM +0100, Florian Weimer wrote:
> Raised on the musl list here:
>   Choice of wchar_t mapping for non-ASCII bytes in the POSIX locale
>   <https://www.openwall.com/lists/musl/2022/11/10/1>

That thread seems to've been exhausted (at least I don't see anything
fresh in the archive) ‒ should I just resend with the comments for v7
applied, or do you have a mapping range you'd rather see given those
givens?

наб

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v6 2/2] POSIX locale covers every byte [BZ# 29511]
  2022-11-28 16:24                 ` наб
@ 2022-12-02 17:36                   ` Florian Weimer
  2022-12-02 18:42                     ` наб
  0 siblings, 1 reply; 29+ messages in thread
From: Florian Weimer @ 2022-12-02 17:36 UTC (permalink / raw)
  To: наб; +Cc: libc-alpha, Victor Stinner

* наб:

> 18-day bump :)
>
> On Thu, Nov 10, 2022 at 09:10:57AM +0100, Florian Weimer wrote:
>> Raised on the musl list here:
>>   Choice of wchar_t mapping for non-ASCII bytes in the POSIX locale
>>   <https://www.openwall.com/lists/musl/2022/11/10/1>
>
> That thread seems to've been exhausted (at least I don't see anything
> fresh in the archive) ‒ should I just resend with the comments for v7
> applied, or do you have a mapping range you'd rather see given those
> givens?

I still can't make up my mind.  I think the options are:

* Some sort of custom encoding (like you posted).
* Latin-1
* UTF-8 with surrogate escape encoding (and encouraging POSIX to change again)

What argues in favor of the last point is that many, many people are
using C.UTF-8 nowadays.  And effectively disabling wide/multibyte
conversion until you call setlocale does not seem particularly useful.

I have a feeling of déjà vu regarding this—I think I have investigated
non-setlocale defaults for wide/multibyte conversion in the past, but
can't find the previous discussion.

Thanks,
Florian


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v6 2/2] POSIX locale covers every byte [BZ# 29511]
  2022-12-02 17:36                   ` Florian Weimer
@ 2022-12-02 18:42                     ` наб
  0 siblings, 0 replies; 29+ messages in thread
From: наб @ 2022-12-02 18:42 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha, Victor Stinner

[-- Attachment #1: Type: text/plain, Size: 3921 bytes --]

Hi!

On Fri, Dec 02, 2022 at 06:36:26PM +0100, Florian Weimer wrote:
> * наб:
> > On Thu, Nov 10, 2022 at 09:10:57AM +0100, Florian Weimer wrote:
> >> Raised on the musl list here:
> >>   Choice of wchar_t mapping for non-ASCII bytes in the POSIX locale
> >>   <https://www.openwall.com/lists/musl/2022/11/10/1>
> >
> > That thread seems to've been exhausted (at least I don't see anything
> > fresh in the archive) ‒ should I just resend with the comments for v7
> > applied, or do you have a mapping range you'd rather see given those
> > givens?
> 
> I still can't make up my mind.  I think the options are:
> 
> * Some sort of custom encoding (like you posted).
For which there's Prior Art in both other libcs and implementations
of similar mechanisms in unrelated software, making it just about what
users expect, and the lowest-energy conversion to what POSIX
has mandated for close to 8 years now.

> * Latin-1
Sorry, what? The Latin-1 that's so poorly defined W3C requires,
per spec, that Latin-1 charset specs be ignored? The Latin-1 they got
wrong so badly they made two subsequent standards, only one of which
compatible? The Latin-1 that has some random subset of germanic and
maybe like french if you squint and that's apparently fine? The iron
curtain has fallen, for better or for worse, since the 80s. If that's
the "solution", then leave "C" 7-bit. I'm gonna assume that's a joke.
(Also, people would then try to "use" it, and then (a) you've lost,
 but (b) the collation sequence is gonna be wrong always because
 it no longer represents any given language
 (though apparently it's "fine" if they collate in any random order,
  so it's legal per spec to make it just spanish, I think;
  this is somehow even worse, and I may be misunderstanding
  POSIX 7.3.2.6 because it'd mean that other parts of the standard
  that use and recommend "LC_ALL=C utility ..." to process bytes,
  like for sort, are also wrong?).)

> * UTF-8 with surrogate escape encoding (and encouraging POSIX to change again)
Well it's not gonna ‒ at least I don't think it is ‒ given that I don't
think it /changed/ anything actually? Issue 7-2008 7.2 says
> The tables in Locale Definition describe the characteristics and
> behavior of the POSIX locale for data consisting entirely of
> characters from the portable character set and the control character
> set. For other characters, the behavior is unspecified.

And just TC2 specified what had been unspecified behaviour I think?
Implementations had freedom to do whatever, including UTF-8, until 2016.
Naturally, as we're seeing now, not one has exercised that freedom.
If glibc /did/ do POSIX=C=C.UTF-8 before then,
then maybe we'd see a different result, but it hadn't, so we didn't.

> What argues in favor of the last point is that many, many people are
> using C.UTF-8 nowadays.
Great! They can continue to use C.UTF-8. They have had to opt in to
their preferred encoding like everyone else, and they will continue.
0 changes observed here.

> And effectively disabling wide/multibyte
> conversion until you call setlocale does not seem particularly useful.
"Mangling input data until explicitly disabled" is worse than
"input data is data, and you can make it characters".
Don't take me for not-a-UTF-8-maximalist, but, y'know,
it will never, unfortunately, be all I see,
and being able to completely opt out of additional input processing
would be nice; we're kinda close now with the current hard-7-bit ASCII,
and making it, essentially, I Can't Believe It's Not Just Bytes!,
per pt. 1, would eliminate even more head-aches IME.

Putting pro-verbial KOI-8 or [your grandma's favourite encoding] through
the UTF-8 grinder is much worse than just degrading to strcmp(),
but at this point I think I'm rambling, and spilled enough ink;
your call to make, at the end of the day.

наб

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v8] POSIX locale covers every byte [BZ# 29511]
  2022-11-10  9:52                 ` Florian Weimer
@ 2023-01-09 15:17                   ` наб
  2023-02-07 14:16                     ` [PATCH v9] " наб
  0 siblings, 1 reply; 29+ messages in thread
From: наб @ 2023-01-09 15:17 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha, Victor Stinner

[-- Attachment #1: Type: text/plain, Size: 24243 bytes --]

This largely duplicates the ASCII code with the error path changed

There are two user-facing changes:
  * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
  * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b

Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
  (a) is 1-byte, stateless, and contains 256 characters
  (b) they collate in byte order
  (c) the first 128 characters are equivalent to ASCII (like previous)
cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
changes to the standard;
in short, this means that mbrtowc() must never fail and must return
  b if b <= 0x7F else ab+c for all bytes b
  where c is some constant >=0x80
    and a is a positive integer constant

By strategically picking c=<UDF00> we land at the tail-end of the
Unicode Low Surrogate Area at DC00-DFFF, described as
  > Isolated surrogate code points have no interpretation;
  > consequently, no character code charts or names lists
  > are provided for this range.
and match musl

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
New year, new me, new LC_CTYPE=C that isn't a headache?

Rebased, NEWS added w/your comments for v7.

 NEWS                              |   7 ++
 iconv/Makefile                    |   2 +-
 iconv/gconv_builtin.h             |   8 ++
 iconv/gconv_int.h                 |   8 ++
 iconv/gconv_posix.c               |  96 ++++++++++++++++++++
 iconv/tst-iconv_prog.sh           |  43 +++++++++
 iconvdata/tst-tables.sh           |   1 +
 inet/tst-idna_name_classify.c     |   6 +-
 locale/tst-C-locale.c             |  44 +++++++++
 localedata/charmaps/POSIX         | 136 ++++++++++++++++++++++++++++
 localedata/locales/POSIX          | 143 +++++++++++++++++++++++++++++-
 stdio-common/Makefile             |   1 +
 stdio-common/tst-printf-bz25691.c |   2 +
 wcsmbs/wcsmbsload.c               |  14 +--
 14 files changed, 500 insertions(+), 11 deletions(-)
 create mode 100644 iconv/gconv_posix.c
 create mode 100644 localedata/charmaps/POSIX

diff --git a/NEWS b/NEWS
index 5c276dacb3..fbe70e45e8 100644
--- a/NEWS
+++ b/NEWS
@@ -13,6 +13,13 @@ Major new features:
   getent with --no-addrconfig may contain addresses of families not
   configured on the current host i.e. as-if you had not passed
   AI_ADDRCONFIG to getaddrinfo calls.
+* The default/"POSIX"/"C" locale's character set is now "POSIX",
+  instead of "ANSI_X3.4-1968" ‒ this is a new fully-reversible
+  8-bit transparent encoding for compatibility with POSIX Issue 7 TC 2,
+  identity-mapping bytes in the ASCII [0, 0x7F] range,
+  and mapping [0x80, 0xFF] bytes to [<U+DF80>, <U+DFFF>].
+  The standard now requires the "POSIX"/"C" locale to have an encoding
+  with these features ‒ 8-bit transparency and a continuous collation sequence
 
 Deprecated and removed features, and other changes affecting compatibility:
 
diff --git a/iconv/Makefile b/iconv/Makefile
index afb3fb7bdb..b61e130377 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -25,7 +25,7 @@ include ../Makeconfig
 headers		= iconv.h gconv.h
 routines	= iconv_open iconv iconv_close \
 		  gconv_open gconv gconv_close gconv_db gconv_conf \
-		  gconv_builtin gconv_simple gconv_trans gconv_cache
+		  gconv_builtin gconv_simple gconv_posix gconv_trans gconv_cache
 routines	+= gconv_dl gconv_charset
 
 vpath %.c ../locale/programs ../intl
diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h
index 35608b4461..d2dcdd44a3 100644
--- a/iconv/gconv_builtin.h
+++ b/iconv/gconv_builtin.h
@@ -89,6 +89,14 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
 			__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
 
 
+BUILTIN_TRANSFORMATION ("POSIX//", "INTERNAL", 1, "=posix->INTERNAL",
+			__gconv_transform_posix_internal, __gconv_btwoc_posix,
+			1, 1, 4, 4)
+
+BUILTIN_TRANSFORMATION ("INTERNAL", "POSIX//", 1, "=INTERNAL->posix",
+			__gconv_transform_internal_posix, NULL, 4, 4, 1, 1)
+
+
 #if BYTE_ORDER == BIG_ENDIAN
 BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
 BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index da792a95f5..6e4f3aeca2 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -281,6 +281,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2)
 
 __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
+__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
 __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
 __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
@@ -299,6 +301,12 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
    only ASCII characters.  */
 extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
 
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+     attribute_hidden;
+
 #endif
 
 __END_DECLS
diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c
new file mode 100644
index 0000000000..412d910109
--- /dev/null
+++ b/iconv/gconv_posix.c
@@ -0,0 +1,96 @@
+/* "POSIX" locale transformation functions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <gconv_int.h>
+
+
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+wint_t
+__gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+{
+  if (c < 0x80)
+    return c;
+  else
+    return 0xdf00 + c;
+}
+
+
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DF80, U+DFFF]}
+   to the internal (UCS4-like) format.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		1
+#define MIN_NEEDED_TO		4
+#define FROM_DIRECTION		1
+#define FROM_LOOP		posix_internal_loop
+#define TO_LOOP			posix_internal_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_posix_internal
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdf00 + *inptr++;			      \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;				      \
+    outptr += sizeof (uint32_t);					      \
+  }
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DF80, U+DFFF] => [0x80, 0xFF]}.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		4
+#define MIN_NEEDED_TO		1
+#define FROM_DIRECTION		1
+#define FROM_LOOP		internal_posix_loop
+#define TO_LOOP			internal_posix_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_internal_posix
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    uint32_t val = *((const uint32_t *) inptr);				      \
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdf80) || val > 0xdfff))      \
+      {									      \
+	UNICODE_TAG_HANDLER (val, 4);					      \
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
+      }									      \
+    else								      \
+      {									      \
+	if (val > 0x7f)							      \
+	  val -= 0xdf00;						      \
+	*outptr++ = val;						      \
+	inptr += sizeof (uint32_t);					      \
+      }									      \
+  }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
index 76400cddfc..6a416134e2 100644
--- a/iconv/tst-iconv_prog.sh
+++ b/iconv/tst-iconv_prog.sh
@@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do
   execute_test
   check_errtest_result
 done
+
+allbytes ()
+{
+  for (( i = 0; i <= 255; i++ )); do
+    printf '\'"$(printf "%o" "$i")"
+  done
+}
+
+allucs4be ()
+{
+  for (( i = 0; i <= 127; i++ )); do
+    printf '\0\0\0\'"$(printf "%o" "$i")"
+  done
+  for (( i = 128; i <= 255; i++ )); do
+    printf '\0\0\xdf\'"$(printf "%o" "$i")"
+  done
+}
+
+check_posix_result ()
+{
+  if [ $? -eq 0 ]; then
+    result=PASS
+  else
+    result=FAIL
+  fi
+
+  echo "$result: from \"$1\", to: \"$2\""
+
+  if [ "$result" != "PASS" ]; then
+    exit 1
+  fi
+}
+
+check_posix_encoding ()
+{
+  eval PROG=\"$ICONV\"
+  allbytes  | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be)
+  check_posix_result POSIX UCS-4BE
+  allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes)
+  check_posix_result UCS-4BE POSIX
+}
+
+check_posix_encoding
diff --git a/iconvdata/tst-tables.sh b/iconvdata/tst-tables.sh
index ddac85daa1..badce3e4ca 100755
--- a/iconvdata/tst-tables.sh
+++ b/iconvdata/tst-tables.sh
@@ -31,6 +31,7 @@ cat <<EOF |
   # Keep this list in the same order as gconv-modules.
   #
   # charset name    table name          comment
+  POSIX
   ASCII             ANSI_X3.4-1968
   ISO646-GB         BS_4730
   ISO646-CA         CSA_Z243.4-1985-1
diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c
index bb1c0b5331..0f645cbca5 100644
--- a/inet/tst-idna_name_classify.c
+++ b/inet/tst-idna_name_classify.c
@@ -37,11 +37,11 @@ do_test (void)
   puts ("info: C locale tests");
   locale_insensitive_tests ();
   TEST_COMPARE (__idna_name_classify ("abc\200def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
   TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
-                idna_name_encoding_error);
+                idna_name_nonascii_backslash);
   TEST_COMPARE (__idna_name_classify ("abc\377def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
 
   puts ("info: en_US.ISO-8859-1 locale tests");
   if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)
diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
index d4c22b749a..67bb33be07 100644
--- a/locale/tst-C-locale.c
+++ b/locale/tst-C-locale.c
@@ -20,6 +20,7 @@
 #include <langinfo.h>
 #include <limits.h>
 #include <locale.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <string.h>
 #include <wchar.h>
@@ -229,6 +230,49 @@ run_test (const char *locname)
   STRTEST (YESSTR, "");
   STRTEST (NOSTR, "");
 
+  for(int i = 0; i <= 0xff; ++i)
+    {
+      unsigned char bs[] = {i, 0};
+      mbstate_t ctx = {};
+      wchar_t wc = -1, exp = i <= 0x7f ? i : (0xdf00 + i);
+      size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);
+      if (sz != !!i)
+	{
+	  printf ("mbrtowc(%02hhx) width in locale %s wrong "
+		  "(is %zd, should be %d)\n", *bs, locname, sz, !!i);
+	  result = 1;
+	}
+      if (wc != exp)
+	{
+	  printf ("mbrtowc(%02hhx) value in locale %s wrong "
+		  "(is %x, should be %x)\n", *bs, locname, wc, exp);
+	  result = 1;
+	}
+    }
+
+  for (int i = 0; i <= 0xffff; ++i)
+    {
+      bool expok = (i <= 0x7f) || (i >= 0xdf80 && i <= 0xdfff);
+      size_t expsz = expok ? 1 : (size_t) -1;
+      unsigned char expob = expok ? (i & 0xff) : (unsigned char) -1;
+
+      unsigned char ob = -1;
+      mbstate_t ctx = {};
+      size_t sz = wcrtomb ((char *) &ob, i, &ctx);
+      if (sz != expsz)
+	{
+	  printf ("wcrtomb(%x) width in locale %s wrong "
+		  "(is %zd, should be %zd)\n", i, locname, sz, expsz);
+	  result = 1;
+	}
+      if (ob != expob)
+	{
+	  printf ("wcrtomb(%x) value in locale %s wrong "
+		  "(is %hhx, should be %hhx)\n", i, locname, ob, expob);
+	  result = 1;
+	}
+    }
+
   /* Test the new locale mechanisms.  */
   loc = newlocale (LC_ALL_MASK, locname, NULL);
   if (loc == NULL)
diff --git a/localedata/charmaps/POSIX b/localedata/charmaps/POSIX
new file mode 100644
index 0000000000..c44007ff49
--- /dev/null
+++ b/localedata/charmaps/POSIX
@@ -0,0 +1,136 @@
+<code_set_name> POSIX
+<comment_char> %
+<escape_char> /
+% source: cf. localedata/locales/POSIX, LC_COLLATE
+
+CHARMAP
+<U0000>     /x00         NULL (NUL)
+<U0001>     /x01         START OF HEADING (SOH)
+<U0002>     /x02         START OF TEXT (STX)
+<U0003>     /x03         END OF TEXT (ETX)
+<U0004>     /x04         END OF TRANSMISSION (EOT)
+<U0005>     /x05         ENQUIRY (ENQ)
+<U0006>     /x06         ACKNOWLEDGE (ACK)
+<U0007>     /x07         BELL (BEL)
+<U0008>     /x08         BACKSPACE (BS)
+<U0009>     /x09         CHARACTER TABULATION (HT)
+<U000A>     /x0a         LINE FEED (LF)
+<U000B>     /x0b         LINE TABULATION (VT)
+<U000C>     /x0c         FORM FEED (FF)
+<U000D>     /x0d         CARRIAGE RETURN (CR)
+<U000E>     /x0e         SHIFT OUT (SO)
+<U000F>     /x0f         SHIFT IN (SI)
+<U0010>     /x10         DATALINK ESCAPE (DLE)
+<U0011>     /x11         DEVICE CONTROL ONE (DC1)
+<U0012>     /x12         DEVICE CONTROL TWO (DC2)
+<U0013>     /x13         DEVICE CONTROL THREE (DC3)
+<U0014>     /x14         DEVICE CONTROL FOUR (DC4)
+<U0015>     /x15         NEGATIVE ACKNOWLEDGE (NAK)
+<U0016>     /x16         SYNCHRONOUS IDLE (SYN)
+<U0017>     /x17         END OF TRANSMISSION BLOCK (ETB)
+<U0018>     /x18         CANCEL (CAN)
+<U0019>     /x19         END OF MEDIUM (EM)
+<U001A>     /x1a         SUBSTITUTE (SUB)
+<U001B>     /x1b         ESCAPE (ESC)
+<U001C>     /x1c         FILE SEPARATOR (IS4)
+<U001D>     /x1d         GROUP SEPARATOR (IS3)
+<U001E>     /x1e         RECORD SEPARATOR (IS2)
+<U001F>     /x1f         UNIT SEPARATOR (IS1)
+<U0020>     /x20         SPACE
+<U0021>     /x21         EXCLAMATION MARK
+<U0022>     /x22         QUOTATION MARK
+<U0023>     /x23         NUMBER SIGN
+<U0024>     /x24         DOLLAR SIGN
+<U0025>     /x25         PERCENT SIGN
+<U0026>     /x26         AMPERSAND
+<U0027>     /x27         APOSTROPHE
+<U0028>     /x28         LEFT PARENTHESIS
+<U0029>     /x29         RIGHT PARENTHESIS
+<U002A>     /x2a         ASTERISK
+<U002B>     /x2b         PLUS SIGN
+<U002C>     /x2c         COMMA
+<U002D>     /x2d         HYPHEN-MINUS
+<U002E>     /x2e         FULL STOP
+<U002F>     /x2f         SOLIDUS
+<U0030>     /x30         DIGIT ZERO
+<U0031>     /x31         DIGIT ONE
+<U0032>     /x32         DIGIT TWO
+<U0033>     /x33         DIGIT THREE
+<U0034>     /x34         DIGIT FOUR
+<U0035>     /x35         DIGIT FIVE
+<U0036>     /x36         DIGIT SIX
+<U0037>     /x37         DIGIT SEVEN
+<U0038>     /x38         DIGIT EIGHT
+<U0039>     /x39         DIGIT NINE
+<U003A>     /x3a         COLON
+<U003B>     /x3b         SEMICOLON
+<U003C>     /x3c         LESS-THAN SIGN
+<U003D>     /x3d         EQUALS SIGN
+<U003E>     /x3e         GREATER-THAN SIGN
+<U003F>     /x3f         QUESTION MARK
+<U0040>     /x40         COMMERCIAL AT
+<U0041>     /x41         LATIN CAPITAL LETTER A
+<U0042>     /x42         LATIN CAPITAL LETTER B
+<U0043>     /x43         LATIN CAPITAL LETTER C
+<U0044>     /x44         LATIN CAPITAL LETTER D
+<U0045>     /x45         LATIN CAPITAL LETTER E
+<U0046>     /x46         LATIN CAPITAL LETTER F
+<U0047>     /x47         LATIN CAPITAL LETTER G
+<U0048>     /x48         LATIN CAPITAL LETTER H
+<U0049>     /x49         LATIN CAPITAL LETTER I
+<U004A>     /x4a         LATIN CAPITAL LETTER J
+<U004B>     /x4b         LATIN CAPITAL LETTER K
+<U004C>     /x4c         LATIN CAPITAL LETTER L
+<U004D>     /x4d         LATIN CAPITAL LETTER M
+<U004E>     /x4e         LATIN CAPITAL LETTER N
+<U004F>     /x4f         LATIN CAPITAL LETTER O
+<U0050>     /x50         LATIN CAPITAL LETTER P
+<U0051>     /x51         LATIN CAPITAL LETTER Q
+<U0052>     /x52         LATIN CAPITAL LETTER R
+<U0053>     /x53         LATIN CAPITAL LETTER S
+<U0054>     /x54         LATIN CAPITAL LETTER T
+<U0055>     /x55         LATIN CAPITAL LETTER U
+<U0056>     /x56         LATIN CAPITAL LETTER V
+<U0057>     /x57         LATIN CAPITAL LETTER W
+<U0058>     /x58         LATIN CAPITAL LETTER X
+<U0059>     /x59         LATIN CAPITAL LETTER Y
+<U005A>     /x5a         LATIN CAPITAL LETTER Z
+<U005B>     /x5b         LEFT SQUARE BRACKET
+<U005C>     /x5c         REVERSE SOLIDUS
+<U005D>     /x5d         RIGHT SQUARE BRACKET
+<U005E>     /x5e         CIRCUMFLEX ACCENT
+<U005F>     /x5f         LOW LINE
+<U0060>     /x60         GRAVE ACCENT
+<U0061>     /x61         LATIN SMALL LETTER A
+<U0062>     /x62         LATIN SMALL LETTER B
+<U0063>     /x63         LATIN SMALL LETTER C
+<U0064>     /x64         LATIN SMALL LETTER D
+<U0065>     /x65         LATIN SMALL LETTER E
+<U0066>     /x66         LATIN SMALL LETTER F
+<U0067>     /x67         LATIN SMALL LETTER G
+<U0068>     /x68         LATIN SMALL LETTER H
+<U0069>     /x69         LATIN SMALL LETTER I
+<U006A>     /x6a         LATIN SMALL LETTER J
+<U006B>     /x6b         LATIN SMALL LETTER K
+<U006C>     /x6c         LATIN SMALL LETTER L
+<U006D>     /x6d         LATIN SMALL LETTER M
+<U006E>     /x6e         LATIN SMALL LETTER N
+<U006F>     /x6f         LATIN SMALL LETTER O
+<U0070>     /x70         LATIN SMALL LETTER P
+<U0071>     /x71         LATIN SMALL LETTER Q
+<U0072>     /x72         LATIN SMALL LETTER R
+<U0073>     /x73         LATIN SMALL LETTER S
+<U0074>     /x74         LATIN SMALL LETTER T
+<U0075>     /x75         LATIN SMALL LETTER U
+<U0076>     /x76         LATIN SMALL LETTER V
+<U0077>     /x77         LATIN SMALL LETTER W
+<U0078>     /x78         LATIN SMALL LETTER X
+<U0079>     /x79         LATIN SMALL LETTER Y
+<U007A>     /x7a         LATIN SMALL LETTER Z
+<U007B>     /x7b         LEFT CURLY BRACKET
+<U007C>     /x7c         VERTICAL LINE
+<U007D>     /x7d         RIGHT CURLY BRACKET
+<U007E>     /x7e         TILDE
+<U007F>     /x7f         DELETE (DEL)
+<UDF80>..<UDFFF> /x80
+END CHARMAP
diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
index 7ec7f1c577..fc34a6abc1 100644
--- a/localedata/locales/POSIX
+++ b/localedata/locales/POSIX
@@ -97,6 +97,20 @@ END LC_CTYPE
 LC_COLLATE
 % This is the POSIX Locale definition for the LC_COLLATE category.
 % The order is the same as in the ASCII code set.
+% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2,
+% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale:
+% > All characters not explicitly listed here shall be inserted
+% > in the character collation order after the listed characters
+% > and shall be assigned unique primary weights. If the listed
+% > characters have ASCII encoding, the other characters shall
+% > be in ascending order according to their coded character set values
+% Since Issue 7 TC2 (XBD, 6.2 Character Encoding):
+% > The POSIX locale shall contain 256 single-byte characters [...]
+% (cf. bug 663, 674).
+% this is in contrast to previous issues, which limited the POSIX
+% locale to the Portable Character Set (7-bit ASCII).
+% We use the end of the Low Surrogate Area to contain these,
+% yielding [<UDF80>, <UDFFF>]
 order_start forward
 <U0000>
 <U0001>
@@ -226,7 +240,134 @@ order_start forward
 <U007D>
 <U007E>
 <U007F>
-UNDEFINED
+<UDF80>
+<UDF81>
+<UDF82>
+<UDF83>
+<UDF84>
+<UDF85>
+<UDF86>
+<UDF87>
+<UDF88>
+<UDF89>
+<UDF8A>
+<UDF8B>
+<UDF8C>
+<UDF8D>
+<UDF8E>
+<UDF8F>
+<UDF90>
+<UDF91>
+<UDF92>
+<UDF93>
+<UDF94>
+<UDF95>
+<UDF96>
+<UDF97>
+<UDF98>
+<UDF99>
+<UDF9A>
+<UDF9B>
+<UDF9C>
+<UDF9D>
+<UDF9E>
+<UDF9F>
+<UDFA0>
+<UDFA1>
+<UDFA2>
+<UDFA3>
+<UDFA4>
+<UDFA5>
+<UDFA6>
+<UDFA7>
+<UDFA8>
+<UDFA9>
+<UDFAA>
+<UDFAB>
+<UDFAC>
+<UDFAD>
+<UDFAE>
+<UDFAF>
+<UDFB0>
+<UDFB1>
+<UDFB2>
+<UDFB3>
+<UDFB4>
+<UDFB5>
+<UDFB6>
+<UDFB7>
+<UDFB8>
+<UDFB9>
+<UDFBA>
+<UDFBB>
+<UDFBC>
+<UDFBD>
+<UDFBE>
+<UDFBF>
+<UDFC0>
+<UDFC1>
+<UDFC2>
+<UDFC3>
+<UDFC4>
+<UDFC5>
+<UDFC6>
+<UDFC7>
+<UDFC8>
+<UDFC9>
+<UDFCA>
+<UDFCB>
+<UDFCC>
+<UDFCD>
+<UDFCE>
+<UDFCF>
+<UDFD0>
+<UDFD1>
+<UDFD2>
+<UDFD3>
+<UDFD4>
+<UDFD5>
+<UDFD6>
+<UDFD7>
+<UDFD8>
+<UDFD9>
+<UDFDA>
+<UDFDB>
+<UDFDC>
+<UDFDD>
+<UDFDE>
+<UDFDF>
+<UDFE0>
+<UDFE1>
+<UDFE2>
+<UDFE3>
+<UDFE4>
+<UDFE5>
+<UDFE6>
+<UDFE7>
+<UDFE8>
+<UDFE9>
+<UDFEA>
+<UDFEB>
+<UDFEC>
+<UDFED>
+<UDFEE>
+<UDFEF>
+<UDFF0>
+<UDFF1>
+<UDFF2>
+<UDFF3>
+<UDFF4>
+<UDFF5>
+<UDFF6>
+<UDFF7>
+<UDFF8>
+<UDFF9>
+<UDFFA>
+<UDFFB>
+<UDFFC>
+<UDFFD>
+<UDFFE>
+<UDFFF>
 order_end
 %
 END LC_COLLATE
diff --git a/stdio-common/Makefile b/stdio-common/Makefile
index 6e9d104524..4015a1217e 100644
--- a/stdio-common/Makefile
+++ b/stdio-common/Makefile
@@ -333,6 +333,7 @@ $(objpfx)test-vfprintf.out: $(gen-locales)
 $(objpfx)tst-grouping.out: $(gen-locales)
 $(objpfx)tst-grouping2.out: $(gen-locales)
 $(objpfx)tst-grouping_iterator.out: $(gen-locales)
+$(objpfx)tst-printf-bz25691-mem.out: $(gen-locales)
 $(objpfx)tst-sprintf.out: $(gen-locales)
 $(objpfx)tst-sscanf.out: $(gen-locales)
 $(objpfx)tst-swprintf.out: $(gen-locales)
diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
index e6fa2433fa..f953e0d956 100644
--- a/stdio-common/tst-printf-bz25691.c
+++ b/stdio-common/tst-printf-bz25691.c
@@ -30,6 +30,8 @@
 static int
 do_test (void)
 {
+  setlocale(LC_CTYPE, "C.UTF-8");
+
   mtrace ();
 
   /* For 's' conversion specifier with 'l' modifier the array must be
diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
index 7b338b6775..0e6297f38d 100644
--- a/wcsmbs/wcsmbsload.c
+++ b/wcsmbs/wcsmbsload.c
@@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
   .__shlib_handle = NULL,
   .__modname = NULL,
   .__counter = INT_MAX,
-  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
+  .__from_name = (char *) "POSIX",
   .__to_name = (char *) "INTERNAL",
-  .__fct = __gconv_transform_ascii_internal,
-  .__btowc_fct = __gconv_btwoc_ascii,
+  .__fct = __gconv_transform_posix_internal,
+  .__btowc_fct = __gconv_btwoc_posix,
   .__init_fct = NULL,
   .__end_fct = NULL,
   .__min_needed_from = 1,
@@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
   .__modname = NULL,
   .__counter = INT_MAX,
   .__from_name = (char *) "INTERNAL",
-  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
-  .__fct = __gconv_transform_internal_ascii,
+  .__to_name = (char *) "POSIX",
+  .__fct = __gconv_transform_internal_posix,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
@@ -67,7 +67,9 @@ static const struct __gconv_step to_mb =
 };
 
 
-/* For the default locale we only have to handle ANSI_X3.4-1968.  */
+/* The default/"POSIX"/"C" locale is an 8-bit-clean mapping
+   with ANSI_X3.4-1968 in the first 128 characters;
+   we lift the remaining bytes by <UDF00>.  */
 const struct gconv_fcts __wcsmbs_gconv_fcts_c =
 {
   .towc = (struct __gconv_step *) &to_wc,
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v9] POSIX locale covers every byte [BZ# 29511]
  2023-01-09 15:17                   ` [PATCH v8] " наб
@ 2023-02-07 14:16                     ` наб
  2023-02-13 14:52                       ` Florian Weimer
  0 siblings, 1 reply; 29+ messages in thread
From: наб @ 2023-02-07 14:16 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha, Victor Stinner

[-- Attachment #1: Type: text/plain, Size: 24180 bytes --]

This largely duplicates the ASCII code with the error path changed

There are two user-facing changes:
  * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
  * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b

Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
  (a) is 1-byte, stateless, and contains 256 characters
  (b) they collate in byte order
  (c) the first 128 characters are equivalent to ASCII (like previous)
cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
changes to the standard;
in short, this means that mbrtowc() must never fail and must return
  b if b <= 0x7F else ab+c for all bytes b
  where c is some constant >=0x80
    and a is a positive integer constant

By strategically picking c=<UDF00> we land at the tail-end of the
Unicode Low Surrogate Area at DC00-DFFF, described as
  > Isolated surrogate code points have no interpretation;
  > consequently, no character code charts or names lists
  > are provided for this range.
and match musl

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
Clean rebase, reposting after a month.

 NEWS                              |   7 ++
 iconv/Makefile                    |   2 +-
 iconv/gconv_builtin.h             |   8 ++
 iconv/gconv_int.h                 |   8 ++
 iconv/gconv_posix.c               |  96 ++++++++++++++++++++
 iconv/tst-iconv_prog.sh           |  43 +++++++++
 iconvdata/tst-tables.sh           |   1 +
 inet/tst-idna_name_classify.c     |   6 +-
 locale/tst-C-locale.c             |  44 +++++++++
 localedata/charmaps/POSIX         | 136 ++++++++++++++++++++++++++++
 localedata/locales/POSIX          | 143 +++++++++++++++++++++++++++++-
 stdio-common/Makefile             |   1 +
 stdio-common/tst-printf-bz25691.c |   2 +
 wcsmbs/wcsmbsload.c               |  14 +--
 14 files changed, 500 insertions(+), 11 deletions(-)
 create mode 100644 iconv/gconv_posix.c
 create mode 100644 localedata/charmaps/POSIX

diff --git a/NEWS b/NEWS
index b227e72c9c..31c74f23f0 100644
--- a/NEWS
+++ b/NEWS
@@ -36,6 +36,13 @@ Major new features:
   getent with --no-addrconfig may contain addresses of families not
   configured on the current host i.e. as-if you had not passed
   AI_ADDRCONFIG to getaddrinfo calls.
+* The default/"POSIX"/"C" locale's character set is now "POSIX",
+  instead of "ANSI_X3.4-1968" ‒ this is a new fully-reversible
+  8-bit transparent encoding for compatibility with POSIX Issue 7 TC 2,
+  identity-mapping bytes in the ASCII [0, 0x7F] range,
+  and mapping [0x80, 0xFF] bytes to [<U+DF80>, <U+DFFF>].
+  The standard now requires the "POSIX"/"C" locale to have an encoding
+  with these features ‒ 8-bit transparency and a continuous collation sequence.
 
 Deprecated and removed features, and other changes affecting compatibility:
 
diff --git a/iconv/Makefile b/iconv/Makefile
index afb3fb7bdb..b61e130377 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -25,7 +25,7 @@ include ../Makeconfig
 headers		= iconv.h gconv.h
 routines	= iconv_open iconv iconv_close \
 		  gconv_open gconv gconv_close gconv_db gconv_conf \
-		  gconv_builtin gconv_simple gconv_trans gconv_cache
+		  gconv_builtin gconv_simple gconv_posix gconv_trans gconv_cache
 routines	+= gconv_dl gconv_charset
 
 vpath %.c ../locale/programs ../intl
diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h
index 35608b4461..d2dcdd44a3 100644
--- a/iconv/gconv_builtin.h
+++ b/iconv/gconv_builtin.h
@@ -89,6 +89,14 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
 			__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
 
 
+BUILTIN_TRANSFORMATION ("POSIX//", "INTERNAL", 1, "=posix->INTERNAL",
+			__gconv_transform_posix_internal, __gconv_btwoc_posix,
+			1, 1, 4, 4)
+
+BUILTIN_TRANSFORMATION ("INTERNAL", "POSIX//", 1, "=INTERNAL->posix",
+			__gconv_transform_internal_posix, NULL, 4, 4, 1, 1)
+
+
 #if BYTE_ORDER == BIG_ENDIAN
 BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
 BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index da792a95f5..6e4f3aeca2 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -281,6 +281,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2)
 
 __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
+__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
 __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
 __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
@@ -299,6 +301,12 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
    only ASCII characters.  */
 extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
 
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+     attribute_hidden;
+
 #endif
 
 __END_DECLS
diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c
new file mode 100644
index 0000000000..412d910109
--- /dev/null
+++ b/iconv/gconv_posix.c
@@ -0,0 +1,96 @@
+/* "POSIX" locale transformation functions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <gconv_int.h>
+
+
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DF80, U+DFFF].  */
+wint_t
+__gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+{
+  if (c < 0x80)
+    return c;
+  else
+    return 0xdf00 + c;
+}
+
+
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DF80, U+DFFF]}
+   to the internal (UCS4-like) format.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		1
+#define MIN_NEEDED_TO		4
+#define FROM_DIRECTION		1
+#define FROM_LOOP		posix_internal_loop
+#define TO_LOOP			posix_internal_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_posix_internal
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdf00 + *inptr++;			      \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;				      \
+    outptr += sizeof (uint32_t);					      \
+  }
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DF80, U+DFFF] => [0x80, 0xFF]}.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		4
+#define MIN_NEEDED_TO		1
+#define FROM_DIRECTION		1
+#define FROM_LOOP		internal_posix_loop
+#define TO_LOOP			internal_posix_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_internal_posix
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    uint32_t val = *((const uint32_t *) inptr);				      \
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdf80) || val > 0xdfff))      \
+      {									      \
+	UNICODE_TAG_HANDLER (val, 4);					      \
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
+      }									      \
+    else								      \
+      {									      \
+	if (val > 0x7f)							      \
+	  val -= 0xdf00;						      \
+	*outptr++ = val;						      \
+	inptr += sizeof (uint32_t);					      \
+      }									      \
+  }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
index 76400cddfc..6a416134e2 100644
--- a/iconv/tst-iconv_prog.sh
+++ b/iconv/tst-iconv_prog.sh
@@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do
   execute_test
   check_errtest_result
 done
+
+allbytes ()
+{
+  for (( i = 0; i <= 255; i++ )); do
+    printf '\'"$(printf "%o" "$i")"
+  done
+}
+
+allucs4be ()
+{
+  for (( i = 0; i <= 127; i++ )); do
+    printf '\0\0\0\'"$(printf "%o" "$i")"
+  done
+  for (( i = 128; i <= 255; i++ )); do
+    printf '\0\0\xdf\'"$(printf "%o" "$i")"
+  done
+}
+
+check_posix_result ()
+{
+  if [ $? -eq 0 ]; then
+    result=PASS
+  else
+    result=FAIL
+  fi
+
+  echo "$result: from \"$1\", to: \"$2\""
+
+  if [ "$result" != "PASS" ]; then
+    exit 1
+  fi
+}
+
+check_posix_encoding ()
+{
+  eval PROG=\"$ICONV\"
+  allbytes  | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be)
+  check_posix_result POSIX UCS-4BE
+  allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes)
+  check_posix_result UCS-4BE POSIX
+}
+
+check_posix_encoding
diff --git a/iconvdata/tst-tables.sh b/iconvdata/tst-tables.sh
index ddac85daa1..badce3e4ca 100755
--- a/iconvdata/tst-tables.sh
+++ b/iconvdata/tst-tables.sh
@@ -31,6 +31,7 @@ cat <<EOF |
   # Keep this list in the same order as gconv-modules.
   #
   # charset name    table name          comment
+  POSIX
   ASCII             ANSI_X3.4-1968
   ISO646-GB         BS_4730
   ISO646-CA         CSA_Z243.4-1985-1
diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c
index bb1c0b5331..0f645cbca5 100644
--- a/inet/tst-idna_name_classify.c
+++ b/inet/tst-idna_name_classify.c
@@ -37,11 +37,11 @@ do_test (void)
   puts ("info: C locale tests");
   locale_insensitive_tests ();
   TEST_COMPARE (__idna_name_classify ("abc\200def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
   TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
-                idna_name_encoding_error);
+                idna_name_nonascii_backslash);
   TEST_COMPARE (__idna_name_classify ("abc\377def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
 
   puts ("info: en_US.ISO-8859-1 locale tests");
   if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)
diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
index d4c22b749a..67bb33be07 100644
--- a/locale/tst-C-locale.c
+++ b/locale/tst-C-locale.c
@@ -20,6 +20,7 @@
 #include <langinfo.h>
 #include <limits.h>
 #include <locale.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <string.h>
 #include <wchar.h>
@@ -229,6 +230,49 @@ run_test (const char *locname)
   STRTEST (YESSTR, "");
   STRTEST (NOSTR, "");
 
+  for(int i = 0; i <= 0xff; ++i)
+    {
+      unsigned char bs[] = {i, 0};
+      mbstate_t ctx = {};
+      wchar_t wc = -1, exp = i <= 0x7f ? i : (0xdf00 + i);
+      size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);
+      if (sz != !!i)
+	{
+	  printf ("mbrtowc(%02hhx) width in locale %s wrong "
+		  "(is %zd, should be %d)\n", *bs, locname, sz, !!i);
+	  result = 1;
+	}
+      if (wc != exp)
+	{
+	  printf ("mbrtowc(%02hhx) value in locale %s wrong "
+		  "(is %x, should be %x)\n", *bs, locname, wc, exp);
+	  result = 1;
+	}
+    }
+
+  for (int i = 0; i <= 0xffff; ++i)
+    {
+      bool expok = (i <= 0x7f) || (i >= 0xdf80 && i <= 0xdfff);
+      size_t expsz = expok ? 1 : (size_t) -1;
+      unsigned char expob = expok ? (i & 0xff) : (unsigned char) -1;
+
+      unsigned char ob = -1;
+      mbstate_t ctx = {};
+      size_t sz = wcrtomb ((char *) &ob, i, &ctx);
+      if (sz != expsz)
+	{
+	  printf ("wcrtomb(%x) width in locale %s wrong "
+		  "(is %zd, should be %zd)\n", i, locname, sz, expsz);
+	  result = 1;
+	}
+      if (ob != expob)
+	{
+	  printf ("wcrtomb(%x) value in locale %s wrong "
+		  "(is %hhx, should be %hhx)\n", i, locname, ob, expob);
+	  result = 1;
+	}
+    }
+
   /* Test the new locale mechanisms.  */
   loc = newlocale (LC_ALL_MASK, locname, NULL);
   if (loc == NULL)
diff --git a/localedata/charmaps/POSIX b/localedata/charmaps/POSIX
new file mode 100644
index 0000000000..c44007ff49
--- /dev/null
+++ b/localedata/charmaps/POSIX
@@ -0,0 +1,136 @@
+<code_set_name> POSIX
+<comment_char> %
+<escape_char> /
+% source: cf. localedata/locales/POSIX, LC_COLLATE
+
+CHARMAP
+<U0000>     /x00         NULL (NUL)
+<U0001>     /x01         START OF HEADING (SOH)
+<U0002>     /x02         START OF TEXT (STX)
+<U0003>     /x03         END OF TEXT (ETX)
+<U0004>     /x04         END OF TRANSMISSION (EOT)
+<U0005>     /x05         ENQUIRY (ENQ)
+<U0006>     /x06         ACKNOWLEDGE (ACK)
+<U0007>     /x07         BELL (BEL)
+<U0008>     /x08         BACKSPACE (BS)
+<U0009>     /x09         CHARACTER TABULATION (HT)
+<U000A>     /x0a         LINE FEED (LF)
+<U000B>     /x0b         LINE TABULATION (VT)
+<U000C>     /x0c         FORM FEED (FF)
+<U000D>     /x0d         CARRIAGE RETURN (CR)
+<U000E>     /x0e         SHIFT OUT (SO)
+<U000F>     /x0f         SHIFT IN (SI)
+<U0010>     /x10         DATALINK ESCAPE (DLE)
+<U0011>     /x11         DEVICE CONTROL ONE (DC1)
+<U0012>     /x12         DEVICE CONTROL TWO (DC2)
+<U0013>     /x13         DEVICE CONTROL THREE (DC3)
+<U0014>     /x14         DEVICE CONTROL FOUR (DC4)
+<U0015>     /x15         NEGATIVE ACKNOWLEDGE (NAK)
+<U0016>     /x16         SYNCHRONOUS IDLE (SYN)
+<U0017>     /x17         END OF TRANSMISSION BLOCK (ETB)
+<U0018>     /x18         CANCEL (CAN)
+<U0019>     /x19         END OF MEDIUM (EM)
+<U001A>     /x1a         SUBSTITUTE (SUB)
+<U001B>     /x1b         ESCAPE (ESC)
+<U001C>     /x1c         FILE SEPARATOR (IS4)
+<U001D>     /x1d         GROUP SEPARATOR (IS3)
+<U001E>     /x1e         RECORD SEPARATOR (IS2)
+<U001F>     /x1f         UNIT SEPARATOR (IS1)
+<U0020>     /x20         SPACE
+<U0021>     /x21         EXCLAMATION MARK
+<U0022>     /x22         QUOTATION MARK
+<U0023>     /x23         NUMBER SIGN
+<U0024>     /x24         DOLLAR SIGN
+<U0025>     /x25         PERCENT SIGN
+<U0026>     /x26         AMPERSAND
+<U0027>     /x27         APOSTROPHE
+<U0028>     /x28         LEFT PARENTHESIS
+<U0029>     /x29         RIGHT PARENTHESIS
+<U002A>     /x2a         ASTERISK
+<U002B>     /x2b         PLUS SIGN
+<U002C>     /x2c         COMMA
+<U002D>     /x2d         HYPHEN-MINUS
+<U002E>     /x2e         FULL STOP
+<U002F>     /x2f         SOLIDUS
+<U0030>     /x30         DIGIT ZERO
+<U0031>     /x31         DIGIT ONE
+<U0032>     /x32         DIGIT TWO
+<U0033>     /x33         DIGIT THREE
+<U0034>     /x34         DIGIT FOUR
+<U0035>     /x35         DIGIT FIVE
+<U0036>     /x36         DIGIT SIX
+<U0037>     /x37         DIGIT SEVEN
+<U0038>     /x38         DIGIT EIGHT
+<U0039>     /x39         DIGIT NINE
+<U003A>     /x3a         COLON
+<U003B>     /x3b         SEMICOLON
+<U003C>     /x3c         LESS-THAN SIGN
+<U003D>     /x3d         EQUALS SIGN
+<U003E>     /x3e         GREATER-THAN SIGN
+<U003F>     /x3f         QUESTION MARK
+<U0040>     /x40         COMMERCIAL AT
+<U0041>     /x41         LATIN CAPITAL LETTER A
+<U0042>     /x42         LATIN CAPITAL LETTER B
+<U0043>     /x43         LATIN CAPITAL LETTER C
+<U0044>     /x44         LATIN CAPITAL LETTER D
+<U0045>     /x45         LATIN CAPITAL LETTER E
+<U0046>     /x46         LATIN CAPITAL LETTER F
+<U0047>     /x47         LATIN CAPITAL LETTER G
+<U0048>     /x48         LATIN CAPITAL LETTER H
+<U0049>     /x49         LATIN CAPITAL LETTER I
+<U004A>     /x4a         LATIN CAPITAL LETTER J
+<U004B>     /x4b         LATIN CAPITAL LETTER K
+<U004C>     /x4c         LATIN CAPITAL LETTER L
+<U004D>     /x4d         LATIN CAPITAL LETTER M
+<U004E>     /x4e         LATIN CAPITAL LETTER N
+<U004F>     /x4f         LATIN CAPITAL LETTER O
+<U0050>     /x50         LATIN CAPITAL LETTER P
+<U0051>     /x51         LATIN CAPITAL LETTER Q
+<U0052>     /x52         LATIN CAPITAL LETTER R
+<U0053>     /x53         LATIN CAPITAL LETTER S
+<U0054>     /x54         LATIN CAPITAL LETTER T
+<U0055>     /x55         LATIN CAPITAL LETTER U
+<U0056>     /x56         LATIN CAPITAL LETTER V
+<U0057>     /x57         LATIN CAPITAL LETTER W
+<U0058>     /x58         LATIN CAPITAL LETTER X
+<U0059>     /x59         LATIN CAPITAL LETTER Y
+<U005A>     /x5a         LATIN CAPITAL LETTER Z
+<U005B>     /x5b         LEFT SQUARE BRACKET
+<U005C>     /x5c         REVERSE SOLIDUS
+<U005D>     /x5d         RIGHT SQUARE BRACKET
+<U005E>     /x5e         CIRCUMFLEX ACCENT
+<U005F>     /x5f         LOW LINE
+<U0060>     /x60         GRAVE ACCENT
+<U0061>     /x61         LATIN SMALL LETTER A
+<U0062>     /x62         LATIN SMALL LETTER B
+<U0063>     /x63         LATIN SMALL LETTER C
+<U0064>     /x64         LATIN SMALL LETTER D
+<U0065>     /x65         LATIN SMALL LETTER E
+<U0066>     /x66         LATIN SMALL LETTER F
+<U0067>     /x67         LATIN SMALL LETTER G
+<U0068>     /x68         LATIN SMALL LETTER H
+<U0069>     /x69         LATIN SMALL LETTER I
+<U006A>     /x6a         LATIN SMALL LETTER J
+<U006B>     /x6b         LATIN SMALL LETTER K
+<U006C>     /x6c         LATIN SMALL LETTER L
+<U006D>     /x6d         LATIN SMALL LETTER M
+<U006E>     /x6e         LATIN SMALL LETTER N
+<U006F>     /x6f         LATIN SMALL LETTER O
+<U0070>     /x70         LATIN SMALL LETTER P
+<U0071>     /x71         LATIN SMALL LETTER Q
+<U0072>     /x72         LATIN SMALL LETTER R
+<U0073>     /x73         LATIN SMALL LETTER S
+<U0074>     /x74         LATIN SMALL LETTER T
+<U0075>     /x75         LATIN SMALL LETTER U
+<U0076>     /x76         LATIN SMALL LETTER V
+<U0077>     /x77         LATIN SMALL LETTER W
+<U0078>     /x78         LATIN SMALL LETTER X
+<U0079>     /x79         LATIN SMALL LETTER Y
+<U007A>     /x7a         LATIN SMALL LETTER Z
+<U007B>     /x7b         LEFT CURLY BRACKET
+<U007C>     /x7c         VERTICAL LINE
+<U007D>     /x7d         RIGHT CURLY BRACKET
+<U007E>     /x7e         TILDE
+<U007F>     /x7f         DELETE (DEL)
+<UDF80>..<UDFFF> /x80
+END CHARMAP
diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
index 7ec7f1c577..fc34a6abc1 100644
--- a/localedata/locales/POSIX
+++ b/localedata/locales/POSIX
@@ -97,6 +97,20 @@ END LC_CTYPE
 LC_COLLATE
 % This is the POSIX Locale definition for the LC_COLLATE category.
 % The order is the same as in the ASCII code set.
+% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2,
+% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale:
+% > All characters not explicitly listed here shall be inserted
+% > in the character collation order after the listed characters
+% > and shall be assigned unique primary weights. If the listed
+% > characters have ASCII encoding, the other characters shall
+% > be in ascending order according to their coded character set values
+% Since Issue 7 TC2 (XBD, 6.2 Character Encoding):
+% > The POSIX locale shall contain 256 single-byte characters [...]
+% (cf. bug 663, 674).
+% this is in contrast to previous issues, which limited the POSIX
+% locale to the Portable Character Set (7-bit ASCII).
+% We use the end of the Low Surrogate Area to contain these,
+% yielding [<UDF80>, <UDFFF>]
 order_start forward
 <U0000>
 <U0001>
@@ -226,7 +240,134 @@ order_start forward
 <U007D>
 <U007E>
 <U007F>
-UNDEFINED
+<UDF80>
+<UDF81>
+<UDF82>
+<UDF83>
+<UDF84>
+<UDF85>
+<UDF86>
+<UDF87>
+<UDF88>
+<UDF89>
+<UDF8A>
+<UDF8B>
+<UDF8C>
+<UDF8D>
+<UDF8E>
+<UDF8F>
+<UDF90>
+<UDF91>
+<UDF92>
+<UDF93>
+<UDF94>
+<UDF95>
+<UDF96>
+<UDF97>
+<UDF98>
+<UDF99>
+<UDF9A>
+<UDF9B>
+<UDF9C>
+<UDF9D>
+<UDF9E>
+<UDF9F>
+<UDFA0>
+<UDFA1>
+<UDFA2>
+<UDFA3>
+<UDFA4>
+<UDFA5>
+<UDFA6>
+<UDFA7>
+<UDFA8>
+<UDFA9>
+<UDFAA>
+<UDFAB>
+<UDFAC>
+<UDFAD>
+<UDFAE>
+<UDFAF>
+<UDFB0>
+<UDFB1>
+<UDFB2>
+<UDFB3>
+<UDFB4>
+<UDFB5>
+<UDFB6>
+<UDFB7>
+<UDFB8>
+<UDFB9>
+<UDFBA>
+<UDFBB>
+<UDFBC>
+<UDFBD>
+<UDFBE>
+<UDFBF>
+<UDFC0>
+<UDFC1>
+<UDFC2>
+<UDFC3>
+<UDFC4>
+<UDFC5>
+<UDFC6>
+<UDFC7>
+<UDFC8>
+<UDFC9>
+<UDFCA>
+<UDFCB>
+<UDFCC>
+<UDFCD>
+<UDFCE>
+<UDFCF>
+<UDFD0>
+<UDFD1>
+<UDFD2>
+<UDFD3>
+<UDFD4>
+<UDFD5>
+<UDFD6>
+<UDFD7>
+<UDFD8>
+<UDFD9>
+<UDFDA>
+<UDFDB>
+<UDFDC>
+<UDFDD>
+<UDFDE>
+<UDFDF>
+<UDFE0>
+<UDFE1>
+<UDFE2>
+<UDFE3>
+<UDFE4>
+<UDFE5>
+<UDFE6>
+<UDFE7>
+<UDFE8>
+<UDFE9>
+<UDFEA>
+<UDFEB>
+<UDFEC>
+<UDFED>
+<UDFEE>
+<UDFEF>
+<UDFF0>
+<UDFF1>
+<UDFF2>
+<UDFF3>
+<UDFF4>
+<UDFF5>
+<UDFF6>
+<UDFF7>
+<UDFF8>
+<UDFF9>
+<UDFFA>
+<UDFFB>
+<UDFFC>
+<UDFFD>
+<UDFFE>
+<UDFFF>
 order_end
 %
 END LC_COLLATE
diff --git a/stdio-common/Makefile b/stdio-common/Makefile
index 652d9e5f95..a212d5120a 100644
--- a/stdio-common/Makefile
+++ b/stdio-common/Makefile
@@ -336,6 +336,7 @@ $(objpfx)test-vfprintf.out: $(gen-locales)
 $(objpfx)tst-grouping.out: $(gen-locales)
 $(objpfx)tst-grouping2.out: $(gen-locales)
 $(objpfx)tst-grouping_iterator.out: $(gen-locales)
+$(objpfx)tst-printf-bz25691-mem.out: $(gen-locales)
 $(objpfx)tst-sprintf.out: $(gen-locales)
 $(objpfx)tst-sscanf.out: $(gen-locales)
 $(objpfx)tst-swprintf.out: $(gen-locales)
diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
index e6fa2433fa..f953e0d956 100644
--- a/stdio-common/tst-printf-bz25691.c
+++ b/stdio-common/tst-printf-bz25691.c
@@ -30,6 +30,8 @@
 static int
 do_test (void)
 {
+  setlocale(LC_CTYPE, "C.UTF-8");
+
   mtrace ();
 
   /* For 's' conversion specifier with 'l' modifier the array must be
diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
index 7b338b6775..0e6297f38d 100644
--- a/wcsmbs/wcsmbsload.c
+++ b/wcsmbs/wcsmbsload.c
@@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
   .__shlib_handle = NULL,
   .__modname = NULL,
   .__counter = INT_MAX,
-  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
+  .__from_name = (char *) "POSIX",
   .__to_name = (char *) "INTERNAL",
-  .__fct = __gconv_transform_ascii_internal,
-  .__btowc_fct = __gconv_btwoc_ascii,
+  .__fct = __gconv_transform_posix_internal,
+  .__btowc_fct = __gconv_btwoc_posix,
   .__init_fct = NULL,
   .__end_fct = NULL,
   .__min_needed_from = 1,
@@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
   .__modname = NULL,
   .__counter = INT_MAX,
   .__from_name = (char *) "INTERNAL",
-  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
-  .__fct = __gconv_transform_internal_ascii,
+  .__to_name = (char *) "POSIX",
+  .__fct = __gconv_transform_internal_posix,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
@@ -67,7 +67,9 @@ static const struct __gconv_step to_mb =
 };
 
 
-/* For the default locale we only have to handle ANSI_X3.4-1968.  */
+/* The default/"POSIX"/"C" locale is an 8-bit-clean mapping
+   with ANSI_X3.4-1968 in the first 128 characters;
+   we lift the remaining bytes by <UDF00>.  */
 const struct gconv_fcts __wcsmbs_gconv_fcts_c =
 {
   .towc = (struct __gconv_step *) &to_wc,
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v9] POSIX locale covers every byte [BZ# 29511]
  2023-02-07 14:16                     ` [PATCH v9] " наб
@ 2023-02-13 14:52                       ` Florian Weimer
  2023-04-26 18:54                         ` наб
  0 siblings, 1 reply; 29+ messages in thread
From: Florian Weimer @ 2023-02-13 14:52 UTC (permalink / raw)
  To: наб; +Cc: libc-alpha, Victor Stinner

* наб:

> This largely duplicates the ASCII code with the error path changed
>
> There are two user-facing changes:
>   * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
>   * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b
>
> Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
>   (a) is 1-byte, stateless, and contains 256 characters
>   (b) they collate in byte order
>   (c) the first 128 characters are equivalent to ASCII (like previous)
> cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
> changes to the standard;
> in short, this means that mbrtowc() must never fail and must return
>   b if b <= 0x7F else ab+c for all bytes b
>   where c is some constant >=0x80
>     and a is a positive integer constant
>
> By strategically picking c=<UDF00> we land at the tail-end of the
> Unicode Low Surrogate Area at DC00-DFFF, described as
>   > Isolated surrogate code points have no interpretation;
>   > consequently, no character code charts or names lists
>   > are provided for this range.
> and match musl

I've thought about this some more, and I don't think this is the
direction we should be going in.

* Add a UTF-8SE charset to glibc: it's UTF-8 with surrogate encoding (in
  the Python style).  It should have the property that it can encode
  every byte string as a string of wchar_t characters, and convert the
  result back.  It's not entirely trivial because we need to handle
  partial UTF-8 sequences at the end of the buffer carefully.  There
  might be some warts regarding EILSEQ handling lurking there.  Like the
  Python approach, it is somewhat imperfect because it's not preserving
  identity under string concatenation, i.e. f(x) || f(y) is not always
  equal to f(x || y), but that's just unavoidable.

* Switch the charset for the default C locale to UTF-8SE.  This matches
  the POSIX requirement that every byte can be encoded.

* Work with POSIX to drop the requirement that the C locale needs to be
  a single-byte locale.

* (Optional, somewhat unrelated.) Add a generic mechanism so that UTF-8
  locales can be used as UTF-8SE without recompilation.

Thanks,
Florian


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v9] POSIX locale covers every byte [BZ# 29511]
  2023-02-13 14:52                       ` Florian Weimer
@ 2023-04-26 18:54                         ` наб
  2023-04-26 21:27                           ` Florian Weimer
  0 siblings, 1 reply; 29+ messages in thread
From: наб @ 2023-04-26 18:54 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha, Victor Stinner

[-- Attachment #1: Type: text/plain, Size: 6803 bytes --]

Hi! Long time, apologies.

On Mon, Feb 13, 2023 at 03:52:06PM +0100, Florian Weimer wrote:
> > This largely duplicates the ASCII code with the error path changed
> >
> > There are two user-facing changes:
> >   * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
> >   * mbrtowc() and friends return b if b <= 0x7F else <UDF00>+b
> >
> > Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
> >   (a) is 1-byte, stateless, and contains 256 characters
> >   (b) they collate in byte order
> >   (c) the first 128 characters are equivalent to ASCII (like previous)
> > cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
> > changes to the standard;
> > in short, this means that mbrtowc() must never fail and must return
> >   b if b <= 0x7F else ab+c for all bytes b
> >   where c is some constant >=0x80
> >     and a is a positive integer constant
> >
> > By strategically picking c=<UDF00> we land at the tail-end of the
> > Unicode Low Surrogate Area at DC00-DFFF, described as
> >   > Isolated surrogate code points have no interpretation;
> >   > consequently, no character code charts or names lists
> >   > are provided for this range.
> > and match musl
> 
> I've thought about this some more, and I don't think this is the
> direction we should be going in.
> 
> * Add a UTF-8SE charset to glibc: it's UTF-8 with surrogate encoding (in
>   the Python style).  It should have the property that it can encode
>   every byte string as a string of wchar_t characters, and convert the
>   result back.  It's not entirely trivial because we need to handle
>   partial UTF-8 sequences at the end of the buffer carefully.  There
>   might be some warts regarding EILSEQ handling lurking there.  Like the
>   Python approach, it is somewhat imperfect because it's not preserving
>   identity under string concatenation, i.e. f(x) || f(y) is not always
>   equal to f(x || y), but that's just unavoidable.
> 
> * Switch the charset for the default C locale to UTF-8SE.  This matches
>   the POSIX requirement that every byte can be encoded.
The main point of LC_CTYPE=POSIX as specified is that it allows you to
process paths (which are sequences of bytes, not characters) in a sane
way ‒ part of that is that collation needs to be correct, so maybe, as a
smoke test, "[a, b, c] < [a, b, c+1] for all a,b,c".

  >>> b'\xc4\xbf'.decode('UTF-8', errors='surrogateescape')
  'Ŀ'
  >>> b'\xc4\xc0'.decode('UTF-8', errors='surrogateescape')
  '\udcc4\udcc0'
  >>>
  >>> [*map(ord, b'\xc4\xbf'.decode('UTF-8', errors='surrogateescape'))]
  [319]
  >>> [*map(ord, b'\xc4\xc0'.decode('UTF-8', errors='surrogateescape'))]
  [56516, 56512]
which, I mean, sure, maybe that's sensible (I wouldn't say so), but
  >>> b'\xef\xbf\xbf'.decode('UTF-8', errors='surrogateescape')
  '\uffff'
  >>> b'\xef\xbf\xc0'.decode('UTF-8', errors='surrogateescape')
  '\udcef\udcbf\udcc0'
  >>>
  >>> [*map(ord, b'\xef\xbf\xbf'.decode('UTF-8', errors='surrogateescape'))]
  [65535]
  >>> [*map(ord, b'\xef\xbf\xc0'.decode('UTF-8', errors='surrogateescape'))]
  [56559, 56511, 56512]

Which means you can't process arbitrary data (pathnames) in a way that
makes sense. In my opinion this would be /worse/ than the current
behaviour, behaving erratically in the presence of Some Data instead of
simply not supporting it.

> * Work with POSIX to drop the requirement that the C locale needs to be
>   a single-byte locale.
That's not going to happen because it's the /only/ way to process paths.
Indeed, XBD 8.2 puts it nicely:
  Users may use the following environment variables to announce specific
  localization requirements to applications.
As a user, I want to be able to announce "each byte is a character,
 in natural ordering". This is what LC_CTYPE=C lets me do. I hope
you'll agree this is a good feature to be support.

POSIX, also, explicitly says that (XBD 8.2):
5499  1. If the LC_ALL environment variable is defined and is not null, the value of LC_ALL shall
5500     be used.
5501  2. If the LC_* environment variable (LC_COLLATE, LC_CTYPE, LC_MESSAGES,
5502     LC_MONETARY, LC_NUMERIC, LC_TIME) is defined and is not null, the value of the
5503     environment variable shall be used to initialize the category that corresponds to the
5504     environment variable.
5505  3. If the LANG environment variable is defined and is not null, the value of the LANG
5506     environment variable shall be used.
5507  4. If the LANG environment variable is not set or is set to the empty string, the
5508     implementation-defined default locale shall be used.
and XBD 7.2:
3643  All implementations shall define a locale as the default locale, to be invoked when no
3644  environment variables are set, or set to the empty string. This default locale can be the POSIX
3645  locale or any other implementation-defined locale. Some implementations may provide facilities
3646  for local installation administrators to set the default locale, customizing it for each location.
3647  POSIX.1-202x does not require such a facility.


To that end, how's about:
  * invent UTF-8SE encoding as you say
  * invent POSIX   encoding like in this patch
    (but move the area to match UTF-8SE probably, it's a good precedent)
  * hook up POSIX to POSIX as in here
  * change the implementation-defined default locale to POSIX-but-UTF-8SE
  * (maybe) change the default locale on entry to main() to POSIX-but-UTF-8SE

POSIX requires that LC_ALL=POSIX is the default on entry to main().
That said, I wouldn't mind violating /that/, since anything we do with it
is backwards-compatible. Maybe it makes sense to do that for programs that
don't call setlocale() at all, and they'll behave better when used
internationally. Or not.

Logically, this translates to:
  * if the user has their native locale selected, use that
  * if the user has explicitly selected the bytewise locale, use that
  * if the user hasn't configured their locales at all,
    assume they want UTF-8 but degrade sensibly
  * (maybe) if the program hasn't been written with locales in mind,
            assume the user will be using it with UTF-8 input but
			degrade sensibly

I think this leaves the wolf full and the sheep alive ‒ the default
behaviour is UTF-8(ish), and can be overridden to full UTF-8 or bytes,
per the user's requirements.

Existing users will thus gain the ability to:
  * process data that's UTF-8 but skip over/retain
    illegal/otherwise-encoded bytes losslessly
    (this makes the sample above a killer feature instead of non-sensible,
     so long as it's an encoding in its own right)
  * correctly process arbitrily-encoded data as bytes

Thoughts?
наб

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH v9] POSIX locale covers every byte [BZ# 29511]
  2023-04-26 18:54                         ` наб
@ 2023-04-26 21:27                           ` Florian Weimer
  2023-04-27  0:17                             ` [PATCH v10] " наб
  0 siblings, 1 reply; 29+ messages in thread
From: Florian Weimer @ 2023-04-26 21:27 UTC (permalink / raw)
  To: наб; +Cc: libc-alpha, Victor Stinner

* наб:

>> I've thought about this some more, and I don't think this is the
>> direction we should be going in.
>> 
>> * Add a UTF-8SE charset to glibc: it's UTF-8 with surrogate encoding (in
>>   the Python style).  It should have the property that it can encode
>>   every byte string as a string of wchar_t characters, and convert the
>>   result back.  It's not entirely trivial because we need to handle
>>   partial UTF-8 sequences at the end of the buffer carefully.  There
>>   might be some warts regarding EILSEQ handling lurking there.  Like the
>>   Python approach, it is somewhat imperfect because it's not preserving
>>   identity under string concatenation, i.e. f(x) || f(y) is not always
>>   equal to f(x || y), but that's just unavoidable.
>> 
>> * Switch the charset for the default C locale to UTF-8SE.  This matches
>>   the POSIX requirement that every byte can be encoded.

> The main point of LC_CTYPE=POSIX as specified is that it allows you to
> process paths (which are sequences of bytes, not characters) in a sane
> way ‒ part of that is that collation needs to be correct, so maybe, as a
> smoke test, "[a, b, c] < [a, b, c+1] for all a,b,c".
>
>   >>> b'\xc4\xbf'.decode('UTF-8', errors='surrogateescape')
>   'Ŀ'
>   >>> b'\xc4\xc0'.decode('UTF-8', errors='surrogateescape')
>   '\udcc4\udcc0'
>   >>>
>   >>> [*map(ord, b'\xc4\xbf'.decode('UTF-8', errors='surrogateescape'))]
>   [319]
>   >>> [*map(ord, b'\xc4\xc0'.decode('UTF-8', errors='surrogateescape'))]
>   [56516, 56512]
> which, I mean, sure, maybe that's sensible (I wouldn't say so), but
>   >>> b'\xef\xbf\xbf'.decode('UTF-8', errors='surrogateescape')
>   '\uffff'
>   >>> b'\xef\xbf\xc0'.decode('UTF-8', errors='surrogateescape')
>   '\udcef\udcbf\udcc0'
>   >>>
>   >>> [*map(ord, b'\xef\xbf\xbf'.decode('UTF-8', errors='surrogateescape'))]
>   [65535]
>   >>> [*map(ord, b'\xef\xbf\xc0'.decode('UTF-8', errors='surrogateescape'))]
>   [56559, 56511, 56512]
>
> Which means you can't process arbitrary data (pathnames) in a way that
> makes sense. In my opinion this would be /worse/ than the current
> behaviour, behaving erratically in the presence of Some Data instead of
> simply not supporting it.

Sorry for letting this linger for so long from my side, too.

Regarding the above, I'm not sure I find this convincing.  That's just
business as usual with collation?

However, after thinking about this some more, my idea (just use a
liberal UTF-8 variant) does not work given the APIs we have, in the
sense that code that works in C.UTF-8 today will stop working under this
hypothetical new locale.

For example, for mbrlen (S, N, PS), we have this requirement:

     If the first N bytes possibly form a valid multibyte character but
     the character is incomplete, the return value is ‘(size_t) -2’.
     Otherwise the multibyte character sequence is invalid and the
     return value is ‘(size_t) -1’.

If every byte sequence is a valid, then mbrlen can never return
(size_t) -2.  It would have to produce surrogate encoding instead.
But this means that detection of valid but incomplete UTF-8 sequences
(say at buffer boundaries) is no longer possible.  And that can't be
good because we would produce unexpected wide characters around
buffer boundaries.

I think this leaves us with a straight byte encoding, so either
ISO-8859-1 for simplicity (and with the cultural bias it brings), or the
musl-style shifted upper half encoding that your patch implements.

In the end, enabling UTF-8 (or some variant) by default is probably not
that important because it directly impacts mostly the wide character
interfaces.  Those are not widely used for a variety of reasons (one
probably being that our implementation is so incredibly slow).

Thanks,
Florian


^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v10] POSIX locale covers every byte [BZ# 29511]
  2023-04-26 21:27                           ` Florian Weimer
@ 2023-04-27  0:17                             ` наб
  2023-04-28 15:43                               ` [PATCH v11] " наб
  0 siblings, 1 reply; 29+ messages in thread
From: наб @ 2023-04-27  0:17 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha, Victor Stinner

[-- Attachment #1: Type: text/plain, Size: 33234 bytes --]

This largely duplicates the ASCII code with the error path changed

There are two user-facing changes:
  * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
  * mbrtowc() and friends return b if b <= 0x7F else <UDC00>+b

Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
  (a) is 1-byte, stateless, and contains 256 characters
  (b) they collate in byte order
  (c) the first 128 characters are equivalent to ASCII (like previous)
cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
changes to the standard;
in short, this means that mbrtowc() must never fail and must return
  b if b <= 0x7F else ab+c for all bytes b
  where c is some constant >=0x80
    and a is a positive integer constant

By strategically picking c=<UDC00> we land at the same point of the
Unicode Low Surrogate Area at DC00-DCFF, described as
  > Isolated surrogate code points have no interpretation;
  > consequently, no character code charts or names lists
  > are provided for this range.
as the Python UTF-8 errors=surrogateescape encoding.

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
Clean rebase, NEWS entry moved up to current.

Changes from v9:
  DF00 -> DC00  (mostly did this in preparation for UTF-8SE locally;
                 doesn't really matter where we put it,
                 except DF00 is more aesthetically pleasing)
  _nl_C_codeset (this appears to be the last duplicate usage of
                 "ANSI_X3.4-1968", accd'g to git grep ANSI_X3.4-1968)
  => localedata/tst-c-utf8-consistency.c


On Wed, 26 Apr 2023 23:27:23 +0200, Florian Weimer wrote:
> Regarding the above, I'm not sure I find this convincing.  That's just
> business as usual with collation?
Not sure what you mean by this. I suppose you could define a collation
table that orders [65535] < [56559, 56511, 56512] &c. for all
combinations, but that sounds unpracticable.

> However, after thinking about this some more, my idea (just use a
> liberal UTF-8 variant) does not work given the APIs we have, in the
> sense that code that works in C.UTF-8 today will stop working under this
> hypothetical new locale.
>
> For example, for mbrlen (S, N, PS), we have this requirement:
>
>      If the first N bytes possibly form a valid multibyte character but
>      the character is incomplete, the return value is ‘(size_t) -2’.
>      Otherwise the multibyte character sequence is invalid and the
>      return value is ‘(size_t) -1’.
>
> If every byte sequence is a valid, then mbrlen can never return
> (size_t) -2.  It would have to produce surrogate encoding instead.
> But this means that detection of valid but incomplete UTF-8 sequences
> (say at buffer boundaries) is no longer possible.  And that can't be
> good because we would produce unexpected wide characters around
> buffer boundaries.
Yeah, this is where I got as well; it's actually rather trivial
to convert the UTF-8 translator to UTF-8SE with
    else if (__glibc_unlikely (wc >= 0xdc00 && wc <= 0xdcff))		      \
      /* It was an invalid byte.  */					      \
      *outptr++ = (unsigned char) wc - 0xdc00;				      \
and
	  errout:							      \
	    /* Eject the bytes constituing the invalid sequence as 0xDC__ */  \
	    do								      \
	      {								      \
		*((uint32_t *) outptr) = 0xdc00 + *inptr++;		      \
		outptr += sizeof (uint32_t);				      \
		--i;							      \
	      }								      \
	    while (i && __glibc_likely (outptr + sizeof (uint32_t) <= outend));\
	    if (i)							      \
	      {								      \
		result = __GCONV_FULL_OUTPUT;				      \
		break;							      \
	      }								      \
	    continue;							      \
but you're still left with the result = __GCONV_INCOMPLETE_INPUT; path.

> I think this leaves us with a straight byte encoding, so either
> ISO-8859-1 for simplicity (and with the cultural bias it brings),
:)  That, and if we return "real" characters, this will get abused
    instantly and permanently (plus, idk how it'd behave, but if the
    letters within that range were to behave like letters, then people
    will not run setlocale(), and if they behaved like opaque blobs but
    were actually letters then that can be reasonably construed "broken";
    all avoided by returning blob-like blobs).

Also, even regardless of the western-european momento brought by that,
proliferating a non-UTF-8 encoding in $(date +%Y) is questionable.

> or the
> musl-style shifted upper half encoding that your patch implements.
idk, may just be me, but I've historically found constant offsets
much simpler than French.

 NEWS                                |   8 ++
 iconv/Makefile                      |   2 +-
 iconv/gconv_builtin.h               |   8 ++
 iconv/gconv_int.h                   |   8 ++
 iconv/gconv_posix.c                 |  96 +++++++++++++++++++
 iconv/tst-iconv_prog.sh             |  43 +++++++++
 iconvdata/tst-tables.sh             |   1 +
 inet/tst-idna_name_classify.c       |   6 +-
 locale/C_name.c                     |   2 +-
 locale/tst-C-locale.c               |  44 +++++++++
 localedata/charmaps/POSIX           | 136 ++++++++++++++++++++++++++
 localedata/locales/POSIX            | 143 +++++++++++++++++++++++++++-
 localedata/tst-c-utf8-consistency.c |  24 ++---
 stdio-common/Makefile               |   1 +
 stdio-common/tst-printf-bz25691.c   |   2 +
 wcsmbs/wcsmbsload.c                 |  14 +--
 16 files changed, 514 insertions(+), 24 deletions(-)
 create mode 100644 iconv/gconv_posix.c
 create mode 100644 localedata/charmaps/POSIX

diff --git a/NEWS b/NEWS
index 40964d2ee0..fd2462e622 100644
--- a/NEWS
+++ b/NEWS
@@ -24,6 +24,14 @@ Major new features:
 * A new tunable, glibc.pthread.stack_hugetlb, can be used to disable
   Transparent Huge Pages (THP) in stack allocation at pthread_create.
 
+* The default/"POSIX"/"C" locale's character set is now "POSIX",
+  instead of "ANSI_X3.4-1968" ‒ this is a new fully-reversible
+  8-bit transparent encoding for compatibility with POSIX Issue 7 TC 2,
+  identity-mapping bytes in the ASCII [0, 0x7F] range,
+  and mapping [0x80, 0xFF] bytes to [<U+DC80>, <U+DCFF>].
+  The standard now requires the "POSIX"/"C" locale to have an encoding
+  with these features ‒ 8-bit transparency and a continuous collation sequence.
+
 Deprecated and removed features, and other changes affecting compatibility:
 
 * In the Linux kernel for the hppa/parisc architecture some of the
diff --git a/iconv/Makefile b/iconv/Makefile
index afb3fb7bdb..b61e130377 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -25,7 +25,7 @@ include ../Makeconfig
 headers		= iconv.h gconv.h
 routines	= iconv_open iconv iconv_close \
 		  gconv_open gconv gconv_close gconv_db gconv_conf \
-		  gconv_builtin gconv_simple gconv_trans gconv_cache
+		  gconv_builtin gconv_simple gconv_posix gconv_trans gconv_cache
 routines	+= gconv_dl gconv_charset
 
 vpath %.c ../locale/programs ../intl
diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h
index 35608b4461..d2dcdd44a3 100644
--- a/iconv/gconv_builtin.h
+++ b/iconv/gconv_builtin.h
@@ -89,6 +89,14 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
 			__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
 
 
+BUILTIN_TRANSFORMATION ("POSIX//", "INTERNAL", 1, "=posix->INTERNAL",
+			__gconv_transform_posix_internal, __gconv_btwoc_posix,
+			1, 1, 4, 4)
+
+BUILTIN_TRANSFORMATION ("INTERNAL", "POSIX//", 1, "=INTERNAL->posix",
+			__gconv_transform_internal_posix, NULL, 4, 4, 1, 1)
+
+
 #if BYTE_ORDER == BIG_ENDIAN
 BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
 BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index 4b247a815f..a46402b772 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -309,6 +309,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2)
 
 __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
+__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
 __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
 __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
@@ -327,6 +329,12 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
    only ASCII characters.  */
 extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
 
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DC80, U+DCFF].  */
+extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+     attribute_hidden;
+
 #endif
 
 __END_DECLS
diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c
new file mode 100644
index 0000000000..c576191531
--- /dev/null
+++ b/iconv/gconv_posix.c
@@ -0,0 +1,96 @@
+/* "POSIX" locale transformation functions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <gconv_int.h>
+
+
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DC80, U+DCFF].  */
+wint_t
+__gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+{
+  if (c < 0x80)
+    return c;
+  else
+    return 0xdc00 + c;
+}
+
+
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DC80, U+DCFF]}
+   to the internal (UCS4-like) format.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		1
+#define MIN_NEEDED_TO		4
+#define FROM_DIRECTION		1
+#define FROM_LOOP		posix_internal_loop
+#define TO_LOOP			posix_internal_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_posix_internal
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdc00 + *inptr++;			      \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;				      \
+    outptr += sizeof (uint32_t);					      \
+  }
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DC80, U+DCFF] => [0x80, 0xFF]}.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		4
+#define MIN_NEEDED_TO		1
+#define FROM_DIRECTION		1
+#define FROM_LOOP		internal_posix_loop
+#define TO_LOOP			internal_posix_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_internal_posix
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    uint32_t val = *((const uint32_t *) inptr);				      \
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdc80) || val > 0xdcff))      \
+      {									      \
+	UNICODE_TAG_HANDLER (val, 4);					      \
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
+      }									      \
+    else								      \
+      {									      \
+	if (val > 0x7f)							      \
+	  val -= 0xdc00;						      \
+	*outptr++ = val;						      \
+	inptr += sizeof (uint32_t);					      \
+      }									      \
+  }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
index 76400cddfc..c757fb2c40 100644
--- a/iconv/tst-iconv_prog.sh
+++ b/iconv/tst-iconv_prog.sh
@@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do
   execute_test
   check_errtest_result
 done
+
+allbytes ()
+{
+  for (( i = 0; i <= 255; i++ )); do
+    printf '\'"$(printf "%o" "$i")"
+  done
+}
+
+allucs4be ()
+{
+  for (( i = 0; i <= 127; i++ )); do
+    printf '\0\0\0\'"$(printf "%o" "$i")"
+  done
+  for (( i = 128; i <= 255; i++ )); do
+    printf '\0\0\xdc\'"$(printf "%o" "$i")"
+  done
+}
+
+check_posix_result ()
+{
+  if [ $? -eq 0 ]; then
+    result=PASS
+  else
+    result=FAIL
+  fi
+
+  echo "$result: from \"$1\", to: \"$2\""
+
+  if [ "$result" != "PASS" ]; then
+    exit 1
+  fi
+}
+
+check_posix_encoding ()
+{
+  eval PROG=\"$ICONV\"
+  allbytes  | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be)
+  check_posix_result POSIX UCS-4BE
+  allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes)
+  check_posix_result UCS-4BE POSIX
+}
+
+check_posix_encoding
diff --git a/iconvdata/tst-tables.sh b/iconvdata/tst-tables.sh
index ddac85daa1..badce3e4ca 100755
--- a/iconvdata/tst-tables.sh
+++ b/iconvdata/tst-tables.sh
@@ -31,6 +31,7 @@ cat <<EOF |
   # Keep this list in the same order as gconv-modules.
   #
   # charset name    table name          comment
+  POSIX
   ASCII             ANSI_X3.4-1968
   ISO646-GB         BS_4730
   ISO646-CA         CSA_Z243.4-1985-1
diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c
index bb1c0b5331..0f645cbca5 100644
--- a/inet/tst-idna_name_classify.c
+++ b/inet/tst-idna_name_classify.c
@@ -37,11 +37,11 @@ do_test (void)
   puts ("info: C locale tests");
   locale_insensitive_tests ();
   TEST_COMPARE (__idna_name_classify ("abc\200def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
   TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
-                idna_name_encoding_error);
+                idna_name_nonascii_backslash);
   TEST_COMPARE (__idna_name_classify ("abc\377def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
 
   puts ("info: en_US.ISO-8859-1 locale tests");
   if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)
diff --git a/locale/C_name.c b/locale/C_name.c
index 7612544f2f..2f52636828 100644
--- a/locale/C_name.c
+++ b/locale/C_name.c
@@ -8,4 +8,4 @@ const char _nl_C_name[] = "C";
 const char _nl_POSIX_name[] = "POSIX";
 
 /* The standard codeset.  */
-const char _nl_C_codeset[] = "ANSI_X3.4-1968";
+const char _nl_C_codeset[] = "POSIX";
diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
index d4c22b749a..a25bff4910 100644
--- a/locale/tst-C-locale.c
+++ b/locale/tst-C-locale.c
@@ -20,6 +20,7 @@
 #include <langinfo.h>
 #include <limits.h>
 #include <locale.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <string.h>
 #include <wchar.h>
@@ -229,6 +230,49 @@ run_test (const char *locname)
   STRTEST (YESSTR, "");
   STRTEST (NOSTR, "");
 
+  for(int i = 0; i <= 0xff; ++i)
+    {
+      unsigned char bs[] = {i, 0};
+      mbstate_t ctx = {};
+      wchar_t wc = -1, exp = i <= 0x7f ? i : (0xdc00 + i);
+      size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);
+      if (sz != !!i)
+	{
+	  printf ("mbrtowc(%02hhx) width in locale %s wrong "
+		  "(is %zd, should be %d)\n", *bs, locname, sz, !!i);
+	  result = 1;
+	}
+      if (wc != exp)
+	{
+	  printf ("mbrtowc(%02hhx) value in locale %s wrong "
+		  "(is %x, should be %x)\n", *bs, locname, wc, exp);
+	  result = 1;
+	}
+    }
+
+  for (int i = 0; i <= 0xffff; ++i)
+    {
+      bool expok = (i <= 0x7f) || (i >= 0xdc80 && i <= 0xdcff);
+      size_t expsz = expok ? 1 : (size_t) -1;
+      unsigned char expob = expok ? (i & 0xff) : (unsigned char) -1;
+
+      unsigned char ob = -1;
+      mbstate_t ctx = {};
+      size_t sz = wcrtomb ((char *) &ob, i, &ctx);
+      if (sz != expsz)
+	{
+	  printf ("wcrtomb(%x) width in locale %s wrong "
+		  "(is %zd, should be %zd)\n", i, locname, sz, expsz);
+	  result = 1;
+	}
+      if (ob != expob)
+	{
+	  printf ("wcrtomb(%x) value in locale %s wrong "
+		  "(is %hhx, should be %hhx)\n", i, locname, ob, expob);
+	  result = 1;
+	}
+    }
+
   /* Test the new locale mechanisms.  */
   loc = newlocale (LC_ALL_MASK, locname, NULL);
   if (loc == NULL)
diff --git a/localedata/charmaps/POSIX b/localedata/charmaps/POSIX
new file mode 100644
index 0000000000..69bdf6b485
--- /dev/null
+++ b/localedata/charmaps/POSIX
@@ -0,0 +1,136 @@
+<code_set_name> POSIX
+<comment_char> %
+<escape_char> /
+% source: cf. localedata/locales/POSIX, LC_COLLATE
+
+CHARMAP
+<U0000>     /x00         NULL (NUL)
+<U0001>     /x01         START OF HEADING (SOH)
+<U0002>     /x02         START OF TEXT (STX)
+<U0003>     /x03         END OF TEXT (ETX)
+<U0004>     /x04         END OF TRANSMISSION (EOT)
+<U0005>     /x05         ENQUIRY (ENQ)
+<U0006>     /x06         ACKNOWLEDGE (ACK)
+<U0007>     /x07         BELL (BEL)
+<U0008>     /x08         BACKSPACE (BS)
+<U0009>     /x09         CHARACTER TABULATION (HT)
+<U000A>     /x0a         LINE FEED (LF)
+<U000B>     /x0b         LINE TABULATION (VT)
+<U000C>     /x0c         FORM FEED (FF)
+<U000D>     /x0d         CARRIAGE RETURN (CR)
+<U000E>     /x0e         SHIFT OUT (SO)
+<U000F>     /x0f         SHIFT IN (SI)
+<U0010>     /x10         DATALINK ESCAPE (DLE)
+<U0011>     /x11         DEVICE CONTROL ONE (DC1)
+<U0012>     /x12         DEVICE CONTROL TWO (DC2)
+<U0013>     /x13         DEVICE CONTROL THREE (DC3)
+<U0014>     /x14         DEVICE CONTROL FOUR (DC4)
+<U0015>     /x15         NEGATIVE ACKNOWLEDGE (NAK)
+<U0016>     /x16         SYNCHRONOUS IDLE (SYN)
+<U0017>     /x17         END OF TRANSMISSION BLOCK (ETB)
+<U0018>     /x18         CANCEL (CAN)
+<U0019>     /x19         END OF MEDIUM (EM)
+<U001A>     /x1a         SUBSTITUTE (SUB)
+<U001B>     /x1b         ESCAPE (ESC)
+<U001C>     /x1c         FILE SEPARATOR (IS4)
+<U001D>     /x1d         GROUP SEPARATOR (IS3)
+<U001E>     /x1e         RECORD SEPARATOR (IS2)
+<U001F>     /x1f         UNIT SEPARATOR (IS1)
+<U0020>     /x20         SPACE
+<U0021>     /x21         EXCLAMATION MARK
+<U0022>     /x22         QUOTATION MARK
+<U0023>     /x23         NUMBER SIGN
+<U0024>     /x24         DOLLAR SIGN
+<U0025>     /x25         PERCENT SIGN
+<U0026>     /x26         AMPERSAND
+<U0027>     /x27         APOSTROPHE
+<U0028>     /x28         LEFT PARENTHESIS
+<U0029>     /x29         RIGHT PARENTHESIS
+<U002A>     /x2a         ASTERISK
+<U002B>     /x2b         PLUS SIGN
+<U002C>     /x2c         COMMA
+<U002D>     /x2d         HYPHEN-MINUS
+<U002E>     /x2e         FULL STOP
+<U002F>     /x2f         SOLIDUS
+<U0030>     /x30         DIGIT ZERO
+<U0031>     /x31         DIGIT ONE
+<U0032>     /x32         DIGIT TWO
+<U0033>     /x33         DIGIT THREE
+<U0034>     /x34         DIGIT FOUR
+<U0035>     /x35         DIGIT FIVE
+<U0036>     /x36         DIGIT SIX
+<U0037>     /x37         DIGIT SEVEN
+<U0038>     /x38         DIGIT EIGHT
+<U0039>     /x39         DIGIT NINE
+<U003A>     /x3a         COLON
+<U003B>     /x3b         SEMICOLON
+<U003C>     /x3c         LESS-THAN SIGN
+<U003D>     /x3d         EQUALS SIGN
+<U003E>     /x3e         GREATER-THAN SIGN
+<U003F>     /x3f         QUESTION MARK
+<U0040>     /x40         COMMERCIAL AT
+<U0041>     /x41         LATIN CAPITAL LETTER A
+<U0042>     /x42         LATIN CAPITAL LETTER B
+<U0043>     /x43         LATIN CAPITAL LETTER C
+<U0044>     /x44         LATIN CAPITAL LETTER D
+<U0045>     /x45         LATIN CAPITAL LETTER E
+<U0046>     /x46         LATIN CAPITAL LETTER F
+<U0047>     /x47         LATIN CAPITAL LETTER G
+<U0048>     /x48         LATIN CAPITAL LETTER H
+<U0049>     /x49         LATIN CAPITAL LETTER I
+<U004A>     /x4a         LATIN CAPITAL LETTER J
+<U004B>     /x4b         LATIN CAPITAL LETTER K
+<U004C>     /x4c         LATIN CAPITAL LETTER L
+<U004D>     /x4d         LATIN CAPITAL LETTER M
+<U004E>     /x4e         LATIN CAPITAL LETTER N
+<U004F>     /x4f         LATIN CAPITAL LETTER O
+<U0050>     /x50         LATIN CAPITAL LETTER P
+<U0051>     /x51         LATIN CAPITAL LETTER Q
+<U0052>     /x52         LATIN CAPITAL LETTER R
+<U0053>     /x53         LATIN CAPITAL LETTER S
+<U0054>     /x54         LATIN CAPITAL LETTER T
+<U0055>     /x55         LATIN CAPITAL LETTER U
+<U0056>     /x56         LATIN CAPITAL LETTER V
+<U0057>     /x57         LATIN CAPITAL LETTER W
+<U0058>     /x58         LATIN CAPITAL LETTER X
+<U0059>     /x59         LATIN CAPITAL LETTER Y
+<U005A>     /x5a         LATIN CAPITAL LETTER Z
+<U005B>     /x5b         LEFT SQUARE BRACKET
+<U005C>     /x5c         REVERSE SOLIDUS
+<U005D>     /x5d         RIGHT SQUARE BRACKET
+<U005E>     /x5e         CIRCUMFLEX ACCENT
+<U005F>     /x5f         LOW LINE
+<U0060>     /x60         GRAVE ACCENT
+<U0061>     /x61         LATIN SMALL LETTER A
+<U0062>     /x62         LATIN SMALL LETTER B
+<U0063>     /x63         LATIN SMALL LETTER C
+<U0064>     /x64         LATIN SMALL LETTER D
+<U0065>     /x65         LATIN SMALL LETTER E
+<U0066>     /x66         LATIN SMALL LETTER F
+<U0067>     /x67         LATIN SMALL LETTER G
+<U0068>     /x68         LATIN SMALL LETTER H
+<U0069>     /x69         LATIN SMALL LETTER I
+<U006A>     /x6a         LATIN SMALL LETTER J
+<U006B>     /x6b         LATIN SMALL LETTER K
+<U006C>     /x6c         LATIN SMALL LETTER L
+<U006D>     /x6d         LATIN SMALL LETTER M
+<U006E>     /x6e         LATIN SMALL LETTER N
+<U006F>     /x6f         LATIN SMALL LETTER O
+<U0070>     /x70         LATIN SMALL LETTER P
+<U0071>     /x71         LATIN SMALL LETTER Q
+<U0072>     /x72         LATIN SMALL LETTER R
+<U0073>     /x73         LATIN SMALL LETTER S
+<U0074>     /x74         LATIN SMALL LETTER T
+<U0075>     /x75         LATIN SMALL LETTER U
+<U0076>     /x76         LATIN SMALL LETTER V
+<U0077>     /x77         LATIN SMALL LETTER W
+<U0078>     /x78         LATIN SMALL LETTER X
+<U0079>     /x79         LATIN SMALL LETTER Y
+<U007A>     /x7a         LATIN SMALL LETTER Z
+<U007B>     /x7b         LEFT CURLY BRACKET
+<U007C>     /x7c         VERTICAL LINE
+<U007D>     /x7d         RIGHT CURLY BRACKET
+<U007E>     /x7e         TILDE
+<U007F>     /x7f         DELETE (DEL)
+<UDC80>..<UDCFF> /x80
+END CHARMAP
diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
index 7ec7f1c577..45f2fa0b31 100644
--- a/localedata/locales/POSIX
+++ b/localedata/locales/POSIX
@@ -97,6 +97,20 @@ END LC_CTYPE
 LC_COLLATE
 % This is the POSIX Locale definition for the LC_COLLATE category.
 % The order is the same as in the ASCII code set.
+% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2,
+% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale:
+% > All characters not explicitly listed here shall be inserted
+% > in the character collation order after the listed characters
+% > and shall be assigned unique primary weights. If the listed
+% > characters have ASCII encoding, the other characters shall
+% > be in ascending order according to their coded character set values
+% Since Issue 7 TC2 (XBD, 6.2 Character Encoding):
+% > The POSIX locale shall contain 256 single-byte characters [...]
+% (cf. bug 663, 674).
+% this is in contrast to previous issues, which limited the POSIX
+% locale to the Portable Character Set (7-bit ASCII).
+% We use the same part of the Low Surrogate Area as Python
+% to contain these, yielding [<UDC80>, <UDCFF>]
 order_start forward
 <U0000>
 <U0001>
@@ -226,7 +240,134 @@ order_start forward
 <U007D>
 <U007E>
 <U007F>
-UNDEFINED
+<UDC80>
+<UDC81>
+<UDC82>
+<UDC83>
+<UDC84>
+<UDC85>
+<UDC86>
+<UDC87>
+<UDC88>
+<UDC89>
+<UDC8A>
+<UDC8B>
+<UDC8C>
+<UDC8D>
+<UDC8E>
+<UDC8F>
+<UDC90>
+<UDC91>
+<UDC92>
+<UDC93>
+<UDC94>
+<UDC95>
+<UDC96>
+<UDC97>
+<UDC98>
+<UDC99>
+<UDC9A>
+<UDC9B>
+<UDC9C>
+<UDC9D>
+<UDC9E>
+<UDC9F>
+<UDCA0>
+<UDCA1>
+<UDCA2>
+<UDCA3>
+<UDCA4>
+<UDCA5>
+<UDCA6>
+<UDCA7>
+<UDCA8>
+<UDCA9>
+<UDCAA>
+<UDCAB>
+<UDCAC>
+<UDCAD>
+<UDCAE>
+<UDCAF>
+<UDCB0>
+<UDCB1>
+<UDCB2>
+<UDCB3>
+<UDCB4>
+<UDCB5>
+<UDCB6>
+<UDCB7>
+<UDCB8>
+<UDCB9>
+<UDCBA>
+<UDCBB>
+<UDCBC>
+<UDCBD>
+<UDCBE>
+<UDCBF>
+<UDCC0>
+<UDCC1>
+<UDCC2>
+<UDCC3>
+<UDCC4>
+<UDCC5>
+<UDCC6>
+<UDCC7>
+<UDCC8>
+<UDCC9>
+<UDCCA>
+<UDCCB>
+<UDCCC>
+<UDCCD>
+<UDCCE>
+<UDCCF>
+<UDCD0>
+<UDCD1>
+<UDCD2>
+<UDCD3>
+<UDCD4>
+<UDCD5>
+<UDCD6>
+<UDCD7>
+<UDCD8>
+<UDCD9>
+<UDCDA>
+<UDCDB>
+<UDCDC>
+<UDCDD>
+<UDCDE>
+<UDCDF>
+<UDCE0>
+<UDCE1>
+<UDCE2>
+<UDCE3>
+<UDCE4>
+<UDCE5>
+<UDCE6>
+<UDCE7>
+<UDCE8>
+<UDCE9>
+<UDCEA>
+<UDCEB>
+<UDCEC>
+<UDCED>
+<UDCEE>
+<UDCEF>
+<UDCF0>
+<UDCF1>
+<UDCF2>
+<UDCF3>
+<UDCF4>
+<UDCF5>
+<UDCF6>
+<UDCF7>
+<UDCF8>
+<UDCF9>
+<UDCFA>
+<UDCFB>
+<UDCFC>
+<UDCFD>
+<UDCFE>
+<UDCFF>
 order_end
 %
 END LC_COLLATE
diff --git a/localedata/tst-c-utf8-consistency.c b/localedata/tst-c-utf8-consistency.c
index 1625e4dd0b..bd2f56834c 100644
--- a/localedata/tst-c-utf8-consistency.c
+++ b/localedata/tst-c-utf8-consistency.c
@@ -253,7 +253,7 @@ one_pass (void)
   TEST_COMPARE_STRING_WIDE (wstr (_NL_W_DATE_FMT), wstr_utf8 (_NL_W_DATE_FMT));
 
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_TIME_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_TIME_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_TIME_CODESET), "UTF-8");
 
   TEST_COMPARE_STRING (str (ALTMON_1), str_utf8 (ALTMON_1));
@@ -321,11 +321,11 @@ one_pass (void)
                             wstr_utf8 (_NL_WABALTMON_12));
 
   /* LC_COLLATE.  Mostly untested, only expected differences.  */
-  TEST_COMPARE_STRING (str (_NL_COLLATE_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_COLLATE_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_COLLATE_CODESET), "UTF-8");
 
   /* LC_CTYPE.  Mostly untested, only expected differences.  */
-  TEST_COMPARE_STRING (str (CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (CODESET), "UTF-8");
 
   /* LC_MONETARY.  */
@@ -401,7 +401,7 @@ one_pass (void)
   TEST_COMPARE (word (_NL_MONETARY_THOUSANDS_SEP_WC),
                 word_utf8 (_NL_MONETARY_THOUSANDS_SEP_WC));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_MONETARY_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_MONETARY_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_MONETARY_CODESET), "UTF-8");
 
   /* LC_NUMERIC.  */
@@ -416,7 +416,7 @@ one_pass (void)
   TEST_COMPARE (word (_NL_NUMERIC_THOUSANDS_SEP_WC),
                 word_utf8 (_NL_NUMERIC_THOUSANDS_SEP_WC));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_NUMERIC_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_NUMERIC_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_NUMERIC_CODESET), "UTF-8");
 
   /* LC_MESSAGES.  */
@@ -426,7 +426,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (YESSTR), str_utf8 (YESSTR));
   TEST_COMPARE_STRING (str (NOSTR), str_utf8 (NOSTR));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_MESSAGES_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_MESSAGES_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_MESSAGES_CODESET), "UTF-8");
 
   /* LC_PAPER.  */
@@ -434,7 +434,7 @@ one_pass (void)
   TEST_COMPARE (word (_NL_PAPER_HEIGHT), word_utf8 (_NL_PAPER_HEIGHT));
   TEST_COMPARE (word (_NL_PAPER_WIDTH), word_utf8 (_NL_PAPER_WIDTH));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_PAPER_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_PAPER_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_PAPER_CODESET), "UTF-8");
 
   /* LC_NAME.  */
@@ -452,7 +452,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (_NL_NAME_NAME_MS),
                        str_utf8 (_NL_NAME_NAME_MS));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_NAME_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_NAME_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_NAME_CODESET), "UTF-8");
 
   /* LC_ADDRESS.  */
@@ -482,7 +482,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (_NL_ADDRESS_LANG_LIB),
                        str_utf8 (_NL_ADDRESS_LANG_LIB));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_ADDRESS_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_ADDRESS_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_ADDRESS_CODESET), "UTF-8");
 
   /* LC_TELEPHONE.  */
@@ -496,7 +496,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (_NL_TELEPHONE_INT_PREFIX),
                        str_utf8 (_NL_TELEPHONE_INT_PREFIX));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_TELEPHONE_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_TELEPHONE_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_TELEPHONE_CODESET), "UTF-8");
 
   /* LC_MEASUREMENT.  */
@@ -504,7 +504,7 @@ one_pass (void)
   TEST_COMPARE (byte (_NL_MEASUREMENT_MEASUREMENT),
                 byte_utf8 (_NL_MEASUREMENT_MEASUREMENT));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_MEASUREMENT_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_MEASUREMENT_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_MEASUREMENT_CODESET), "UTF-8");
 
   /* LC_IDENTIFICATION is skipped since C.UTF-8 is distinct from C.  */
@@ -512,7 +512,7 @@ one_pass (void)
   /* _NL_IDENTIFICATION_CATEGORY cannot be tested because it is a
      string array.  */
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_IDENTIFICATION_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_IDENTIFICATION_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_IDENTIFICATION_CODESET), "UTF-8");
 }
 
diff --git a/stdio-common/Makefile b/stdio-common/Makefile
index 7cf8d814ea..ad38093b56 100644
--- a/stdio-common/Makefile
+++ b/stdio-common/Makefile
@@ -349,6 +349,7 @@ $(objpfx)test-vfprintf.out: $(gen-locales)
 $(objpfx)tst-grouping.out: $(gen-locales)
 $(objpfx)tst-grouping2.out: $(gen-locales)
 $(objpfx)tst-grouping_iterator.out: $(gen-locales)
+$(objpfx)tst-printf-bz25691-mem.out: $(gen-locales)
 $(objpfx)tst-sprintf.out: $(gen-locales)
 $(objpfx)tst-sscanf.out: $(gen-locales)
 $(objpfx)tst-swprintf.out: $(gen-locales)
diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
index e6fa2433fa..f953e0d956 100644
--- a/stdio-common/tst-printf-bz25691.c
+++ b/stdio-common/tst-printf-bz25691.c
@@ -30,6 +30,8 @@
 static int
 do_test (void)
 {
+  setlocale(LC_CTYPE, "C.UTF-8");
+
   mtrace ();
 
   /* For 's' conversion specifier with 'l' modifier the array must be
diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
index 7b338b6775..86666e8231 100644
--- a/wcsmbs/wcsmbsload.c
+++ b/wcsmbs/wcsmbsload.c
@@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
   .__shlib_handle = NULL,
   .__modname = NULL,
   .__counter = INT_MAX,
-  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
+  .__from_name = (char *) "POSIX",
   .__to_name = (char *) "INTERNAL",
-  .__fct = __gconv_transform_ascii_internal,
-  .__btowc_fct = __gconv_btwoc_ascii,
+  .__fct = __gconv_transform_posix_internal,
+  .__btowc_fct = __gconv_btwoc_posix,
   .__init_fct = NULL,
   .__end_fct = NULL,
   .__min_needed_from = 1,
@@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
   .__modname = NULL,
   .__counter = INT_MAX,
   .__from_name = (char *) "INTERNAL",
-  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
-  .__fct = __gconv_transform_internal_ascii,
+  .__to_name = (char *) "POSIX",
+  .__fct = __gconv_transform_internal_posix,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
@@ -67,7 +67,9 @@ static const struct __gconv_step to_mb =
 };
 
 
-/* For the default locale we only have to handle ANSI_X3.4-1968.  */
+/* The default/"POSIX"/"C" locale is an 8-bit-clean mapping
+   with ANSI_X3.4-1968 in the first 128 characters;
+   we lift the remaining bytes by <UDC00>.  */
 const struct gconv_fcts __wcsmbs_gconv_fcts_c =
 {
   .towc = (struct __gconv_step *) &to_wc,
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v11] POSIX locale covers every byte [BZ# 29511]
  2023-04-27  0:17                             ` [PATCH v10] " наб
@ 2023-04-28 15:43                               ` наб
  2023-05-07 22:53                                 ` [PATCH v12] " наб
  0 siblings, 1 reply; 29+ messages in thread
From: наб @ 2023-04-28 15:43 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha, Victor Stinner

[-- Attachment #1: Type: text/plain, Size: 30690 bytes --]

This largely duplicates the ASCII code with the error path changed

There are two user-facing changes:
  * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
  * mbrtowc() and friends return b if b <= 0x7F else <UDC00>+b

Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
  (a) is 1-byte, stateless, and contains 256 characters
  (b) they collate in byte order
  (c) the first 128 characters are equivalent to ASCII (like previous)
cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
changes to the standard;
in short, this means that mbrtowc() must never fail and must return
  b if b <= 0x7F else ab+c for all bytes b
  where c is some constant >=0x80
    and a is a positive integer constant

By strategically picking c=<UDC00> we land at the same point of the
Unicode Low Surrogate Area at DC00-DCFF, described as
  > Isolated surrogate code points have no interpretation;
  > consequently, no character code charts or names lists
  > are provided for this range.
as the Python UTF-8 errors=surrogateescape encoding.

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
Range-diff against v10:
1:  0c1053bfa8 ! 1:  e20517811b POSIX locale covers every byte [BZ# 29511]
    @@ iconv/gconv_posix.c (new)
     +      }									      \
     +    else								      \
     +      {									      \
    -+	if (val > 0x7f)							      \
    -+	  val -= 0xdc00;						      \
    -+	*outptr++ = val;						      \
    ++	*outptr++ = val & 0xff;						      \
     +	inptr += sizeof (uint32_t);					      \
     +      }									      \
     +  }

By simplifying this, a simple benchmark
(iconv()ing a 664M UCS-4LE file on tmpfs
 (as obtained by encoding /lib/systemd/systemd a few times)
 to an anonymous mapping)
goes from
  Time (mean ± σ):      1.361 s ±  0.079 s    [User: 0.860 s, System: 0.501 s]
  Range (min … max):    1.217 s …  1.544 s    100 runs
to
  Time (mean ± σ):      1.284 s ±  0.076 s    [User: 0.801 s, System: 0.483 s]
  Range (min … max):    1.163 s …  1.486 s    100 runs
for like 5.6%ish or .116ms/MB, which is definitely worthwhile
(Xeon E5645, DDR3-1600).

 NEWS                                |   8 ++
 iconv/Makefile                      |   2 +-
 iconv/gconv_builtin.h               |   8 ++
 iconv/gconv_int.h                   |   8 ++
 iconv/gconv_posix.c                 |  94 ++++++++++++++++++
 iconv/tst-iconv_prog.sh             |  43 +++++++++
 iconvdata/tst-tables.sh             |   1 +
 inet/tst-idna_name_classify.c       |   6 +-
 locale/C_name.c                     |   2 +-
 locale/tst-C-locale.c               |  44 +++++++++
 localedata/charmaps/POSIX           | 136 ++++++++++++++++++++++++++
 localedata/locales/POSIX            | 143 +++++++++++++++++++++++++++-
 localedata/tst-c-utf8-consistency.c |  24 ++---
 stdio-common/Makefile               |   1 +
 stdio-common/tst-printf-bz25691.c   |   2 +
 wcsmbs/wcsmbsload.c                 |  14 +--
 16 files changed, 512 insertions(+), 24 deletions(-)
 create mode 100644 iconv/gconv_posix.c
 create mode 100644 localedata/charmaps/POSIX

diff --git a/NEWS b/NEWS
index 40964d2ee0..fd2462e622 100644
--- a/NEWS
+++ b/NEWS
@@ -24,6 +24,14 @@ Major new features:
 * A new tunable, glibc.pthread.stack_hugetlb, can be used to disable
   Transparent Huge Pages (THP) in stack allocation at pthread_create.
 
+* The default/"POSIX"/"C" locale's character set is now "POSIX",
+  instead of "ANSI_X3.4-1968" ‒ this is a new fully-reversible
+  8-bit transparent encoding for compatibility with POSIX Issue 7 TC 2,
+  identity-mapping bytes in the ASCII [0, 0x7F] range,
+  and mapping [0x80, 0xFF] bytes to [<U+DC80>, <U+DCFF>].
+  The standard now requires the "POSIX"/"C" locale to have an encoding
+  with these features ‒ 8-bit transparency and a continuous collation sequence.
+
 Deprecated and removed features, and other changes affecting compatibility:
 
 * In the Linux kernel for the hppa/parisc architecture some of the
diff --git a/iconv/Makefile b/iconv/Makefile
index afb3fb7bdb..b61e130377 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -25,7 +25,7 @@ include ../Makeconfig
 headers		= iconv.h gconv.h
 routines	= iconv_open iconv iconv_close \
 		  gconv_open gconv gconv_close gconv_db gconv_conf \
-		  gconv_builtin gconv_simple gconv_trans gconv_cache
+		  gconv_builtin gconv_simple gconv_posix gconv_trans gconv_cache
 routines	+= gconv_dl gconv_charset
 
 vpath %.c ../locale/programs ../intl
diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h
index 35608b4461..d2dcdd44a3 100644
--- a/iconv/gconv_builtin.h
+++ b/iconv/gconv_builtin.h
@@ -89,6 +89,14 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
 			__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
 
 
+BUILTIN_TRANSFORMATION ("POSIX//", "INTERNAL", 1, "=posix->INTERNAL",
+			__gconv_transform_posix_internal, __gconv_btwoc_posix,
+			1, 1, 4, 4)
+
+BUILTIN_TRANSFORMATION ("INTERNAL", "POSIX//", 1, "=INTERNAL->posix",
+			__gconv_transform_internal_posix, NULL, 4, 4, 1, 1)
+
+
 #if BYTE_ORDER == BIG_ENDIAN
 BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
 BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index 4b247a815f..a46402b772 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -309,6 +309,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2)
 
 __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
+__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
 __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
 __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
@@ -327,6 +329,12 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
    only ASCII characters.  */
 extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
 
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DC80, U+DCFF].  */
+extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+     attribute_hidden;
+
 #endif
 
 __END_DECLS
diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c
new file mode 100644
index 0000000000..885929baca
--- /dev/null
+++ b/iconv/gconv_posix.c
@@ -0,0 +1,94 @@
+/* "POSIX" locale transformation functions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <gconv_int.h>
+
+
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DC80, U+DCFF].  */
+wint_t
+__gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+{
+  if (c < 0x80)
+    return c;
+  else
+    return 0xdc00 + c;
+}
+
+
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DC80, U+DCFF]}
+   to the internal (UCS4-like) format.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		1
+#define MIN_NEEDED_TO		4
+#define FROM_DIRECTION		1
+#define FROM_LOOP		posix_internal_loop
+#define TO_LOOP			posix_internal_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_posix_internal
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdc00 + *inptr++;			      \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;				      \
+    outptr += sizeof (uint32_t);					      \
+  }
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DC80, U+DCFF] => [0x80, 0xFF]}.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		4
+#define MIN_NEEDED_TO		1
+#define FROM_DIRECTION		1
+#define FROM_LOOP		internal_posix_loop
+#define TO_LOOP			internal_posix_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_internal_posix
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    uint32_t val = *((const uint32_t *) inptr);				      \
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdc80) || val > 0xdcff))      \
+      {									      \
+	UNICODE_TAG_HANDLER (val, 4);					      \
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
+      }									      \
+    else								      \
+      {									      \
+	*outptr++ = val & 0xff;						      \
+	inptr += sizeof (uint32_t);					      \
+      }									      \
+  }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
index 76400cddfc..c757fb2c40 100644
--- a/iconv/tst-iconv_prog.sh
+++ b/iconv/tst-iconv_prog.sh
@@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do
   execute_test
   check_errtest_result
 done
+
+allbytes ()
+{
+  for (( i = 0; i <= 255; i++ )); do
+    printf '\'"$(printf "%o" "$i")"
+  done
+}
+
+allucs4be ()
+{
+  for (( i = 0; i <= 127; i++ )); do
+    printf '\0\0\0\'"$(printf "%o" "$i")"
+  done
+  for (( i = 128; i <= 255; i++ )); do
+    printf '\0\0\xdc\'"$(printf "%o" "$i")"
+  done
+}
+
+check_posix_result ()
+{
+  if [ $? -eq 0 ]; then
+    result=PASS
+  else
+    result=FAIL
+  fi
+
+  echo "$result: from \"$1\", to: \"$2\""
+
+  if [ "$result" != "PASS" ]; then
+    exit 1
+  fi
+}
+
+check_posix_encoding ()
+{
+  eval PROG=\"$ICONV\"
+  allbytes  | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be)
+  check_posix_result POSIX UCS-4BE
+  allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes)
+  check_posix_result UCS-4BE POSIX
+}
+
+check_posix_encoding
diff --git a/iconvdata/tst-tables.sh b/iconvdata/tst-tables.sh
index ddac85daa1..badce3e4ca 100755
--- a/iconvdata/tst-tables.sh
+++ b/iconvdata/tst-tables.sh
@@ -31,6 +31,7 @@ cat <<EOF |
   # Keep this list in the same order as gconv-modules.
   #
   # charset name    table name          comment
+  POSIX
   ASCII             ANSI_X3.4-1968
   ISO646-GB         BS_4730
   ISO646-CA         CSA_Z243.4-1985-1
diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c
index bb1c0b5331..0f645cbca5 100644
--- a/inet/tst-idna_name_classify.c
+++ b/inet/tst-idna_name_classify.c
@@ -37,11 +37,11 @@ do_test (void)
   puts ("info: C locale tests");
   locale_insensitive_tests ();
   TEST_COMPARE (__idna_name_classify ("abc\200def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
   TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
-                idna_name_encoding_error);
+                idna_name_nonascii_backslash);
   TEST_COMPARE (__idna_name_classify ("abc\377def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
 
   puts ("info: en_US.ISO-8859-1 locale tests");
   if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)
diff --git a/locale/C_name.c b/locale/C_name.c
index 7612544f2f..2f52636828 100644
--- a/locale/C_name.c
+++ b/locale/C_name.c
@@ -8,4 +8,4 @@ const char _nl_C_name[] = "C";
 const char _nl_POSIX_name[] = "POSIX";
 
 /* The standard codeset.  */
-const char _nl_C_codeset[] = "ANSI_X3.4-1968";
+const char _nl_C_codeset[] = "POSIX";
diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
index d4c22b749a..a25bff4910 100644
--- a/locale/tst-C-locale.c
+++ b/locale/tst-C-locale.c
@@ -20,6 +20,7 @@
 #include <langinfo.h>
 #include <limits.h>
 #include <locale.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <string.h>
 #include <wchar.h>
@@ -229,6 +230,49 @@ run_test (const char *locname)
   STRTEST (YESSTR, "");
   STRTEST (NOSTR, "");
 
+  for(int i = 0; i <= 0xff; ++i)
+    {
+      unsigned char bs[] = {i, 0};
+      mbstate_t ctx = {};
+      wchar_t wc = -1, exp = i <= 0x7f ? i : (0xdc00 + i);
+      size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);
+      if (sz != !!i)
+	{
+	  printf ("mbrtowc(%02hhx) width in locale %s wrong "
+		  "(is %zd, should be %d)\n", *bs, locname, sz, !!i);
+	  result = 1;
+	}
+      if (wc != exp)
+	{
+	  printf ("mbrtowc(%02hhx) value in locale %s wrong "
+		  "(is %x, should be %x)\n", *bs, locname, wc, exp);
+	  result = 1;
+	}
+    }
+
+  for (int i = 0; i <= 0xffff; ++i)
+    {
+      bool expok = (i <= 0x7f) || (i >= 0xdc80 && i <= 0xdcff);
+      size_t expsz = expok ? 1 : (size_t) -1;
+      unsigned char expob = expok ? (i & 0xff) : (unsigned char) -1;
+
+      unsigned char ob = -1;
+      mbstate_t ctx = {};
+      size_t sz = wcrtomb ((char *) &ob, i, &ctx);
+      if (sz != expsz)
+	{
+	  printf ("wcrtomb(%x) width in locale %s wrong "
+		  "(is %zd, should be %zd)\n", i, locname, sz, expsz);
+	  result = 1;
+	}
+      if (ob != expob)
+	{
+	  printf ("wcrtomb(%x) value in locale %s wrong "
+		  "(is %hhx, should be %hhx)\n", i, locname, ob, expob);
+	  result = 1;
+	}
+    }
+
   /* Test the new locale mechanisms.  */
   loc = newlocale (LC_ALL_MASK, locname, NULL);
   if (loc == NULL)
diff --git a/localedata/charmaps/POSIX b/localedata/charmaps/POSIX
new file mode 100644
index 0000000000..69bdf6b485
--- /dev/null
+++ b/localedata/charmaps/POSIX
@@ -0,0 +1,136 @@
+<code_set_name> POSIX
+<comment_char> %
+<escape_char> /
+% source: cf. localedata/locales/POSIX, LC_COLLATE
+
+CHARMAP
+<U0000>     /x00         NULL (NUL)
+<U0001>     /x01         START OF HEADING (SOH)
+<U0002>     /x02         START OF TEXT (STX)
+<U0003>     /x03         END OF TEXT (ETX)
+<U0004>     /x04         END OF TRANSMISSION (EOT)
+<U0005>     /x05         ENQUIRY (ENQ)
+<U0006>     /x06         ACKNOWLEDGE (ACK)
+<U0007>     /x07         BELL (BEL)
+<U0008>     /x08         BACKSPACE (BS)
+<U0009>     /x09         CHARACTER TABULATION (HT)
+<U000A>     /x0a         LINE FEED (LF)
+<U000B>     /x0b         LINE TABULATION (VT)
+<U000C>     /x0c         FORM FEED (FF)
+<U000D>     /x0d         CARRIAGE RETURN (CR)
+<U000E>     /x0e         SHIFT OUT (SO)
+<U000F>     /x0f         SHIFT IN (SI)
+<U0010>     /x10         DATALINK ESCAPE (DLE)
+<U0011>     /x11         DEVICE CONTROL ONE (DC1)
+<U0012>     /x12         DEVICE CONTROL TWO (DC2)
+<U0013>     /x13         DEVICE CONTROL THREE (DC3)
+<U0014>     /x14         DEVICE CONTROL FOUR (DC4)
+<U0015>     /x15         NEGATIVE ACKNOWLEDGE (NAK)
+<U0016>     /x16         SYNCHRONOUS IDLE (SYN)
+<U0017>     /x17         END OF TRANSMISSION BLOCK (ETB)
+<U0018>     /x18         CANCEL (CAN)
+<U0019>     /x19         END OF MEDIUM (EM)
+<U001A>     /x1a         SUBSTITUTE (SUB)
+<U001B>     /x1b         ESCAPE (ESC)
+<U001C>     /x1c         FILE SEPARATOR (IS4)
+<U001D>     /x1d         GROUP SEPARATOR (IS3)
+<U001E>     /x1e         RECORD SEPARATOR (IS2)
+<U001F>     /x1f         UNIT SEPARATOR (IS1)
+<U0020>     /x20         SPACE
+<U0021>     /x21         EXCLAMATION MARK
+<U0022>     /x22         QUOTATION MARK
+<U0023>     /x23         NUMBER SIGN
+<U0024>     /x24         DOLLAR SIGN
+<U0025>     /x25         PERCENT SIGN
+<U0026>     /x26         AMPERSAND
+<U0027>     /x27         APOSTROPHE
+<U0028>     /x28         LEFT PARENTHESIS
+<U0029>     /x29         RIGHT PARENTHESIS
+<U002A>     /x2a         ASTERISK
+<U002B>     /x2b         PLUS SIGN
+<U002C>     /x2c         COMMA
+<U002D>     /x2d         HYPHEN-MINUS
+<U002E>     /x2e         FULL STOP
+<U002F>     /x2f         SOLIDUS
+<U0030>     /x30         DIGIT ZERO
+<U0031>     /x31         DIGIT ONE
+<U0032>     /x32         DIGIT TWO
+<U0033>     /x33         DIGIT THREE
+<U0034>     /x34         DIGIT FOUR
+<U0035>     /x35         DIGIT FIVE
+<U0036>     /x36         DIGIT SIX
+<U0037>     /x37         DIGIT SEVEN
+<U0038>     /x38         DIGIT EIGHT
+<U0039>     /x39         DIGIT NINE
+<U003A>     /x3a         COLON
+<U003B>     /x3b         SEMICOLON
+<U003C>     /x3c         LESS-THAN SIGN
+<U003D>     /x3d         EQUALS SIGN
+<U003E>     /x3e         GREATER-THAN SIGN
+<U003F>     /x3f         QUESTION MARK
+<U0040>     /x40         COMMERCIAL AT
+<U0041>     /x41         LATIN CAPITAL LETTER A
+<U0042>     /x42         LATIN CAPITAL LETTER B
+<U0043>     /x43         LATIN CAPITAL LETTER C
+<U0044>     /x44         LATIN CAPITAL LETTER D
+<U0045>     /x45         LATIN CAPITAL LETTER E
+<U0046>     /x46         LATIN CAPITAL LETTER F
+<U0047>     /x47         LATIN CAPITAL LETTER G
+<U0048>     /x48         LATIN CAPITAL LETTER H
+<U0049>     /x49         LATIN CAPITAL LETTER I
+<U004A>     /x4a         LATIN CAPITAL LETTER J
+<U004B>     /x4b         LATIN CAPITAL LETTER K
+<U004C>     /x4c         LATIN CAPITAL LETTER L
+<U004D>     /x4d         LATIN CAPITAL LETTER M
+<U004E>     /x4e         LATIN CAPITAL LETTER N
+<U004F>     /x4f         LATIN CAPITAL LETTER O
+<U0050>     /x50         LATIN CAPITAL LETTER P
+<U0051>     /x51         LATIN CAPITAL LETTER Q
+<U0052>     /x52         LATIN CAPITAL LETTER R
+<U0053>     /x53         LATIN CAPITAL LETTER S
+<U0054>     /x54         LATIN CAPITAL LETTER T
+<U0055>     /x55         LATIN CAPITAL LETTER U
+<U0056>     /x56         LATIN CAPITAL LETTER V
+<U0057>     /x57         LATIN CAPITAL LETTER W
+<U0058>     /x58         LATIN CAPITAL LETTER X
+<U0059>     /x59         LATIN CAPITAL LETTER Y
+<U005A>     /x5a         LATIN CAPITAL LETTER Z
+<U005B>     /x5b         LEFT SQUARE BRACKET
+<U005C>     /x5c         REVERSE SOLIDUS
+<U005D>     /x5d         RIGHT SQUARE BRACKET
+<U005E>     /x5e         CIRCUMFLEX ACCENT
+<U005F>     /x5f         LOW LINE
+<U0060>     /x60         GRAVE ACCENT
+<U0061>     /x61         LATIN SMALL LETTER A
+<U0062>     /x62         LATIN SMALL LETTER B
+<U0063>     /x63         LATIN SMALL LETTER C
+<U0064>     /x64         LATIN SMALL LETTER D
+<U0065>     /x65         LATIN SMALL LETTER E
+<U0066>     /x66         LATIN SMALL LETTER F
+<U0067>     /x67         LATIN SMALL LETTER G
+<U0068>     /x68         LATIN SMALL LETTER H
+<U0069>     /x69         LATIN SMALL LETTER I
+<U006A>     /x6a         LATIN SMALL LETTER J
+<U006B>     /x6b         LATIN SMALL LETTER K
+<U006C>     /x6c         LATIN SMALL LETTER L
+<U006D>     /x6d         LATIN SMALL LETTER M
+<U006E>     /x6e         LATIN SMALL LETTER N
+<U006F>     /x6f         LATIN SMALL LETTER O
+<U0070>     /x70         LATIN SMALL LETTER P
+<U0071>     /x71         LATIN SMALL LETTER Q
+<U0072>     /x72         LATIN SMALL LETTER R
+<U0073>     /x73         LATIN SMALL LETTER S
+<U0074>     /x74         LATIN SMALL LETTER T
+<U0075>     /x75         LATIN SMALL LETTER U
+<U0076>     /x76         LATIN SMALL LETTER V
+<U0077>     /x77         LATIN SMALL LETTER W
+<U0078>     /x78         LATIN SMALL LETTER X
+<U0079>     /x79         LATIN SMALL LETTER Y
+<U007A>     /x7a         LATIN SMALL LETTER Z
+<U007B>     /x7b         LEFT CURLY BRACKET
+<U007C>     /x7c         VERTICAL LINE
+<U007D>     /x7d         RIGHT CURLY BRACKET
+<U007E>     /x7e         TILDE
+<U007F>     /x7f         DELETE (DEL)
+<UDC80>..<UDCFF> /x80
+END CHARMAP
diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
index 7ec7f1c577..45f2fa0b31 100644
--- a/localedata/locales/POSIX
+++ b/localedata/locales/POSIX
@@ -97,6 +97,20 @@ END LC_CTYPE
 LC_COLLATE
 % This is the POSIX Locale definition for the LC_COLLATE category.
 % The order is the same as in the ASCII code set.
+% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2,
+% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale:
+% > All characters not explicitly listed here shall be inserted
+% > in the character collation order after the listed characters
+% > and shall be assigned unique primary weights. If the listed
+% > characters have ASCII encoding, the other characters shall
+% > be in ascending order according to their coded character set values
+% Since Issue 7 TC2 (XBD, 6.2 Character Encoding):
+% > The POSIX locale shall contain 256 single-byte characters [...]
+% (cf. bug 663, 674).
+% this is in contrast to previous issues, which limited the POSIX
+% locale to the Portable Character Set (7-bit ASCII).
+% We use the same part of the Low Surrogate Area as Python
+% to contain these, yielding [<UDC80>, <UDCFF>]
 order_start forward
 <U0000>
 <U0001>
@@ -226,7 +240,134 @@ order_start forward
 <U007D>
 <U007E>
 <U007F>
-UNDEFINED
+<UDC80>
+<UDC81>
+<UDC82>
+<UDC83>
+<UDC84>
+<UDC85>
+<UDC86>
+<UDC87>
+<UDC88>
+<UDC89>
+<UDC8A>
+<UDC8B>
+<UDC8C>
+<UDC8D>
+<UDC8E>
+<UDC8F>
+<UDC90>
+<UDC91>
+<UDC92>
+<UDC93>
+<UDC94>
+<UDC95>
+<UDC96>
+<UDC97>
+<UDC98>
+<UDC99>
+<UDC9A>
+<UDC9B>
+<UDC9C>
+<UDC9D>
+<UDC9E>
+<UDC9F>
+<UDCA0>
+<UDCA1>
+<UDCA2>
+<UDCA3>
+<UDCA4>
+<UDCA5>
+<UDCA6>
+<UDCA7>
+<UDCA8>
+<UDCA9>
+<UDCAA>
+<UDCAB>
+<UDCAC>
+<UDCAD>
+<UDCAE>
+<UDCAF>
+<UDCB0>
+<UDCB1>
+<UDCB2>
+<UDCB3>
+<UDCB4>
+<UDCB5>
+<UDCB6>
+<UDCB7>
+<UDCB8>
+<UDCB9>
+<UDCBA>
+<UDCBB>
+<UDCBC>
+<UDCBD>
+<UDCBE>
+<UDCBF>
+<UDCC0>
+<UDCC1>
+<UDCC2>
+<UDCC3>
+<UDCC4>
+<UDCC5>
+<UDCC6>
+<UDCC7>
+<UDCC8>
+<UDCC9>
+<UDCCA>
+<UDCCB>
+<UDCCC>
+<UDCCD>
+<UDCCE>
+<UDCCF>
+<UDCD0>
+<UDCD1>
+<UDCD2>
+<UDCD3>
+<UDCD4>
+<UDCD5>
+<UDCD6>
+<UDCD7>
+<UDCD8>
+<UDCD9>
+<UDCDA>
+<UDCDB>
+<UDCDC>
+<UDCDD>
+<UDCDE>
+<UDCDF>
+<UDCE0>
+<UDCE1>
+<UDCE2>
+<UDCE3>
+<UDCE4>
+<UDCE5>
+<UDCE6>
+<UDCE7>
+<UDCE8>
+<UDCE9>
+<UDCEA>
+<UDCEB>
+<UDCEC>
+<UDCED>
+<UDCEE>
+<UDCEF>
+<UDCF0>
+<UDCF1>
+<UDCF2>
+<UDCF3>
+<UDCF4>
+<UDCF5>
+<UDCF6>
+<UDCF7>
+<UDCF8>
+<UDCF9>
+<UDCFA>
+<UDCFB>
+<UDCFC>
+<UDCFD>
+<UDCFE>
+<UDCFF>
 order_end
 %
 END LC_COLLATE
diff --git a/localedata/tst-c-utf8-consistency.c b/localedata/tst-c-utf8-consistency.c
index 1625e4dd0b..bd2f56834c 100644
--- a/localedata/tst-c-utf8-consistency.c
+++ b/localedata/tst-c-utf8-consistency.c
@@ -253,7 +253,7 @@ one_pass (void)
   TEST_COMPARE_STRING_WIDE (wstr (_NL_W_DATE_FMT), wstr_utf8 (_NL_W_DATE_FMT));
 
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_TIME_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_TIME_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_TIME_CODESET), "UTF-8");
 
   TEST_COMPARE_STRING (str (ALTMON_1), str_utf8 (ALTMON_1));
@@ -321,11 +321,11 @@ one_pass (void)
                             wstr_utf8 (_NL_WABALTMON_12));
 
   /* LC_COLLATE.  Mostly untested, only expected differences.  */
-  TEST_COMPARE_STRING (str (_NL_COLLATE_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_COLLATE_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_COLLATE_CODESET), "UTF-8");
 
   /* LC_CTYPE.  Mostly untested, only expected differences.  */
-  TEST_COMPARE_STRING (str (CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (CODESET), "UTF-8");
 
   /* LC_MONETARY.  */
@@ -401,7 +401,7 @@ one_pass (void)
   TEST_COMPARE (word (_NL_MONETARY_THOUSANDS_SEP_WC),
                 word_utf8 (_NL_MONETARY_THOUSANDS_SEP_WC));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_MONETARY_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_MONETARY_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_MONETARY_CODESET), "UTF-8");
 
   /* LC_NUMERIC.  */
@@ -416,7 +416,7 @@ one_pass (void)
   TEST_COMPARE (word (_NL_NUMERIC_THOUSANDS_SEP_WC),
                 word_utf8 (_NL_NUMERIC_THOUSANDS_SEP_WC));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_NUMERIC_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_NUMERIC_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_NUMERIC_CODESET), "UTF-8");
 
   /* LC_MESSAGES.  */
@@ -426,7 +426,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (YESSTR), str_utf8 (YESSTR));
   TEST_COMPARE_STRING (str (NOSTR), str_utf8 (NOSTR));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_MESSAGES_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_MESSAGES_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_MESSAGES_CODESET), "UTF-8");
 
   /* LC_PAPER.  */
@@ -434,7 +434,7 @@ one_pass (void)
   TEST_COMPARE (word (_NL_PAPER_HEIGHT), word_utf8 (_NL_PAPER_HEIGHT));
   TEST_COMPARE (word (_NL_PAPER_WIDTH), word_utf8 (_NL_PAPER_WIDTH));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_PAPER_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_PAPER_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_PAPER_CODESET), "UTF-8");
 
   /* LC_NAME.  */
@@ -452,7 +452,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (_NL_NAME_NAME_MS),
                        str_utf8 (_NL_NAME_NAME_MS));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_NAME_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_NAME_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_NAME_CODESET), "UTF-8");
 
   /* LC_ADDRESS.  */
@@ -482,7 +482,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (_NL_ADDRESS_LANG_LIB),
                        str_utf8 (_NL_ADDRESS_LANG_LIB));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_ADDRESS_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_ADDRESS_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_ADDRESS_CODESET), "UTF-8");
 
   /* LC_TELEPHONE.  */
@@ -496,7 +496,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (_NL_TELEPHONE_INT_PREFIX),
                        str_utf8 (_NL_TELEPHONE_INT_PREFIX));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_TELEPHONE_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_TELEPHONE_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_TELEPHONE_CODESET), "UTF-8");
 
   /* LC_MEASUREMENT.  */
@@ -504,7 +504,7 @@ one_pass (void)
   TEST_COMPARE (byte (_NL_MEASUREMENT_MEASUREMENT),
                 byte_utf8 (_NL_MEASUREMENT_MEASUREMENT));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_MEASUREMENT_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_MEASUREMENT_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_MEASUREMENT_CODESET), "UTF-8");
 
   /* LC_IDENTIFICATION is skipped since C.UTF-8 is distinct from C.  */
@@ -512,7 +512,7 @@ one_pass (void)
   /* _NL_IDENTIFICATION_CATEGORY cannot be tested because it is a
      string array.  */
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_IDENTIFICATION_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_IDENTIFICATION_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_IDENTIFICATION_CODESET), "UTF-8");
 }
 
diff --git a/stdio-common/Makefile b/stdio-common/Makefile
index 7cf8d814ea..ad38093b56 100644
--- a/stdio-common/Makefile
+++ b/stdio-common/Makefile
@@ -349,6 +349,7 @@ $(objpfx)test-vfprintf.out: $(gen-locales)
 $(objpfx)tst-grouping.out: $(gen-locales)
 $(objpfx)tst-grouping2.out: $(gen-locales)
 $(objpfx)tst-grouping_iterator.out: $(gen-locales)
+$(objpfx)tst-printf-bz25691-mem.out: $(gen-locales)
 $(objpfx)tst-sprintf.out: $(gen-locales)
 $(objpfx)tst-sscanf.out: $(gen-locales)
 $(objpfx)tst-swprintf.out: $(gen-locales)
diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
index e6fa2433fa..f953e0d956 100644
--- a/stdio-common/tst-printf-bz25691.c
+++ b/stdio-common/tst-printf-bz25691.c
@@ -30,6 +30,8 @@
 static int
 do_test (void)
 {
+  setlocale(LC_CTYPE, "C.UTF-8");
+
   mtrace ();
 
   /* For 's' conversion specifier with 'l' modifier the array must be
diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
index 7b338b6775..86666e8231 100644
--- a/wcsmbs/wcsmbsload.c
+++ b/wcsmbs/wcsmbsload.c
@@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
   .__shlib_handle = NULL,
   .__modname = NULL,
   .__counter = INT_MAX,
-  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
+  .__from_name = (char *) "POSIX",
   .__to_name = (char *) "INTERNAL",
-  .__fct = __gconv_transform_ascii_internal,
-  .__btowc_fct = __gconv_btwoc_ascii,
+  .__fct = __gconv_transform_posix_internal,
+  .__btowc_fct = __gconv_btwoc_posix,
   .__init_fct = NULL,
   .__end_fct = NULL,
   .__min_needed_from = 1,
@@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
   .__modname = NULL,
   .__counter = INT_MAX,
   .__from_name = (char *) "INTERNAL",
-  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
-  .__fct = __gconv_transform_internal_ascii,
+  .__to_name = (char *) "POSIX",
+  .__fct = __gconv_transform_internal_posix,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
@@ -67,7 +67,9 @@ static const struct __gconv_step to_mb =
 };
 
 
-/* For the default locale we only have to handle ANSI_X3.4-1968.  */
+/* The default/"POSIX"/"C" locale is an 8-bit-clean mapping
+   with ANSI_X3.4-1968 in the first 128 characters;
+   we lift the remaining bytes by <UDC00>.  */
 const struct gconv_fcts __wcsmbs_gconv_fcts_c =
 {
   .towc = (struct __gconv_step *) &to_wc,
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v12] POSIX locale covers every byte [BZ# 29511]
  2023-04-28 15:43                               ` [PATCH v11] " наб
@ 2023-05-07 22:53                                 ` наб
  2023-05-29 13:54                                   ` [PATCH v13] " наб
  0 siblings, 1 reply; 29+ messages in thread
From: наб @ 2023-05-07 22:53 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha, Victor Stinner

[-- Attachment #1: Type: text/plain, Size: 30017 bytes --]

This largely duplicates the ASCII code with the error path changed

There are two user-facing changes:
  * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
  * mbrtowc() and friends return b if b <= 0x7F else <UDC00>+b

Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
  (a) is 1-byte, stateless, and contains 256 characters
  (b) they collate in byte order
  (c) the first 128 characters are equivalent to ASCII (like previous)
cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
changes to the standard;
in short, this means that mbrtowc() must never fail and must return
  b if b <= 0x7F else ab+c for all bytes b
  where c is some constant >=0x80
    and a is a positive integer constant

By strategically picking c=<UDC00> we land at the same point of the
Unicode Low Surrogate Area at DC00-DCFF, described as
  > Isolated surrogate code points have no interpretation;
  > consequently, no character code charts or names lists
  > are provided for this range.
as the Python UTF-8 errors=surrogateescape encoding.

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
Resending after a week per guidelines.

Rebased, conflict only in NEWS.

 NEWS                                |   9 ++
 iconv/Makefile                      |   2 +-
 iconv/gconv_builtin.h               |   8 ++
 iconv/gconv_int.h                   |   8 ++
 iconv/gconv_posix.c                 |  94 ++++++++++++++++++
 iconv/tst-iconv_prog.sh             |  43 +++++++++
 iconvdata/tst-tables.sh             |   1 +
 inet/tst-idna_name_classify.c       |   6 +-
 locale/C_name.c                     |   2 +-
 locale/tst-C-locale.c               |  44 +++++++++
 localedata/charmaps/POSIX           | 136 ++++++++++++++++++++++++++
 localedata/locales/POSIX            | 143 +++++++++++++++++++++++++++-
 localedata/tst-c-utf8-consistency.c |  24 ++---
 stdio-common/Makefile               |   1 +
 stdio-common/tst-printf-bz25691.c   |   2 +
 wcsmbs/wcsmbsload.c                 |  14 +--
 16 files changed, 513 insertions(+), 24 deletions(-)
 create mode 100644 iconv/gconv_posix.c
 create mode 100644 localedata/charmaps/POSIX

diff --git a/NEWS b/NEWS
index a52c17c677..06c042bebb 100644
--- a/NEWS
+++ b/NEWS
@@ -24,6 +24,7 @@ Major new features:
 * A new tunable, glibc.pthread.stack_hugetlb, can be used to disable
   Transparent Huge Pages (THP) in stack allocation at pthread_create.
 
+
 * Support for x86_64 running on Hurd has been added.  This port requires
   as least binutils 2.40 and GCC 13:
 
@@ -35,6 +36,14 @@ Major new features:
   The symbol names follow the AArch64 vector ABI, they are declared
   in math.h and have to be called manually at this point.
 
+* The default/"POSIX"/"C" locale's character set is now "POSIX",
+  instead of "ANSI_X3.4-1968" ‒ this is a new fully-reversible
+  8-bit transparent encoding for compatibility with POSIX Issue 7 TC 2,
+  identity-mapping bytes in the ASCII [0, 0x7F] range,
+  and mapping [0x80, 0xFF] bytes to [<U+DC80>, <U+DCFF>].
+  The standard now requires the "POSIX"/"C" locale to have an encoding
+  with these features ‒ 8-bit transparency and a continuous collation sequence.
+
 Deprecated and removed features, and other changes affecting compatibility:
 
 * In the Linux kernel for the hppa/parisc architecture some of the
diff --git a/iconv/Makefile b/iconv/Makefile
index afb3fb7bdb..b61e130377 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -25,7 +25,7 @@ include ../Makeconfig
 headers		= iconv.h gconv.h
 routines	= iconv_open iconv iconv_close \
 		  gconv_open gconv gconv_close gconv_db gconv_conf \
-		  gconv_builtin gconv_simple gconv_trans gconv_cache
+		  gconv_builtin gconv_simple gconv_posix gconv_trans gconv_cache
 routines	+= gconv_dl gconv_charset
 
 vpath %.c ../locale/programs ../intl
diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h
index 35608b4461..d2dcdd44a3 100644
--- a/iconv/gconv_builtin.h
+++ b/iconv/gconv_builtin.h
@@ -89,6 +89,14 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
 			__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
 
 
+BUILTIN_TRANSFORMATION ("POSIX//", "INTERNAL", 1, "=posix->INTERNAL",
+			__gconv_transform_posix_internal, __gconv_btwoc_posix,
+			1, 1, 4, 4)
+
+BUILTIN_TRANSFORMATION ("INTERNAL", "POSIX//", 1, "=INTERNAL->posix",
+			__gconv_transform_internal_posix, NULL, 4, 4, 1, 1)
+
+
 #if BYTE_ORDER == BIG_ENDIAN
 BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
 BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index 4b247a815f..a46402b772 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -309,6 +309,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2)
 
 __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
+__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
 __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
 __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
@@ -327,6 +329,12 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
    only ASCII characters.  */
 extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
 
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DC80, U+DCFF].  */
+extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+     attribute_hidden;
+
 #endif
 
 __END_DECLS
diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c
new file mode 100644
index 0000000000..885929baca
--- /dev/null
+++ b/iconv/gconv_posix.c
@@ -0,0 +1,94 @@
+/* "POSIX" locale transformation functions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <gconv_int.h>
+
+
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DC80, U+DCFF].  */
+wint_t
+__gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+{
+  if (c < 0x80)
+    return c;
+  else
+    return 0xdc00 + c;
+}
+
+
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DC80, U+DCFF]}
+   to the internal (UCS4-like) format.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		1
+#define MIN_NEEDED_TO		4
+#define FROM_DIRECTION		1
+#define FROM_LOOP		posix_internal_loop
+#define TO_LOOP			posix_internal_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_posix_internal
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdc00 + *inptr++;			      \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;				      \
+    outptr += sizeof (uint32_t);					      \
+  }
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DC80, U+DCFF] => [0x80, 0xFF]}.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		4
+#define MIN_NEEDED_TO		1
+#define FROM_DIRECTION		1
+#define FROM_LOOP		internal_posix_loop
+#define TO_LOOP			internal_posix_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_internal_posix
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    uint32_t val = *((const uint32_t *) inptr);				      \
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdc80) || val > 0xdcff))      \
+      {									      \
+	UNICODE_TAG_HANDLER (val, 4);					      \
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
+      }									      \
+    else								      \
+      {									      \
+	*outptr++ = val & 0xff;						      \
+	inptr += sizeof (uint32_t);					      \
+      }									      \
+  }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
index 76400cddfc..c757fb2c40 100644
--- a/iconv/tst-iconv_prog.sh
+++ b/iconv/tst-iconv_prog.sh
@@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do
   execute_test
   check_errtest_result
 done
+
+allbytes ()
+{
+  for (( i = 0; i <= 255; i++ )); do
+    printf '\'"$(printf "%o" "$i")"
+  done
+}
+
+allucs4be ()
+{
+  for (( i = 0; i <= 127; i++ )); do
+    printf '\0\0\0\'"$(printf "%o" "$i")"
+  done
+  for (( i = 128; i <= 255; i++ )); do
+    printf '\0\0\xdc\'"$(printf "%o" "$i")"
+  done
+}
+
+check_posix_result ()
+{
+  if [ $? -eq 0 ]; then
+    result=PASS
+  else
+    result=FAIL
+  fi
+
+  echo "$result: from \"$1\", to: \"$2\""
+
+  if [ "$result" != "PASS" ]; then
+    exit 1
+  fi
+}
+
+check_posix_encoding ()
+{
+  eval PROG=\"$ICONV\"
+  allbytes  | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be)
+  check_posix_result POSIX UCS-4BE
+  allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes)
+  check_posix_result UCS-4BE POSIX
+}
+
+check_posix_encoding
diff --git a/iconvdata/tst-tables.sh b/iconvdata/tst-tables.sh
index ddac85daa1..badce3e4ca 100755
--- a/iconvdata/tst-tables.sh
+++ b/iconvdata/tst-tables.sh
@@ -31,6 +31,7 @@ cat <<EOF |
   # Keep this list in the same order as gconv-modules.
   #
   # charset name    table name          comment
+  POSIX
   ASCII             ANSI_X3.4-1968
   ISO646-GB         BS_4730
   ISO646-CA         CSA_Z243.4-1985-1
diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c
index bb1c0b5331..0f645cbca5 100644
--- a/inet/tst-idna_name_classify.c
+++ b/inet/tst-idna_name_classify.c
@@ -37,11 +37,11 @@ do_test (void)
   puts ("info: C locale tests");
   locale_insensitive_tests ();
   TEST_COMPARE (__idna_name_classify ("abc\200def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
   TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
-                idna_name_encoding_error);
+                idna_name_nonascii_backslash);
   TEST_COMPARE (__idna_name_classify ("abc\377def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
 
   puts ("info: en_US.ISO-8859-1 locale tests");
   if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)
diff --git a/locale/C_name.c b/locale/C_name.c
index 7612544f2f..2f52636828 100644
--- a/locale/C_name.c
+++ b/locale/C_name.c
@@ -8,4 +8,4 @@ const char _nl_C_name[] = "C";
 const char _nl_POSIX_name[] = "POSIX";
 
 /* The standard codeset.  */
-const char _nl_C_codeset[] = "ANSI_X3.4-1968";
+const char _nl_C_codeset[] = "POSIX";
diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
index d4c22b749a..a25bff4910 100644
--- a/locale/tst-C-locale.c
+++ b/locale/tst-C-locale.c
@@ -20,6 +20,7 @@
 #include <langinfo.h>
 #include <limits.h>
 #include <locale.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <string.h>
 #include <wchar.h>
@@ -229,6 +230,49 @@ run_test (const char *locname)
   STRTEST (YESSTR, "");
   STRTEST (NOSTR, "");
 
+  for(int i = 0; i <= 0xff; ++i)
+    {
+      unsigned char bs[] = {i, 0};
+      mbstate_t ctx = {};
+      wchar_t wc = -1, exp = i <= 0x7f ? i : (0xdc00 + i);
+      size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);
+      if (sz != !!i)
+	{
+	  printf ("mbrtowc(%02hhx) width in locale %s wrong "
+		  "(is %zd, should be %d)\n", *bs, locname, sz, !!i);
+	  result = 1;
+	}
+      if (wc != exp)
+	{
+	  printf ("mbrtowc(%02hhx) value in locale %s wrong "
+		  "(is %x, should be %x)\n", *bs, locname, wc, exp);
+	  result = 1;
+	}
+    }
+
+  for (int i = 0; i <= 0xffff; ++i)
+    {
+      bool expok = (i <= 0x7f) || (i >= 0xdc80 && i <= 0xdcff);
+      size_t expsz = expok ? 1 : (size_t) -1;
+      unsigned char expob = expok ? (i & 0xff) : (unsigned char) -1;
+
+      unsigned char ob = -1;
+      mbstate_t ctx = {};
+      size_t sz = wcrtomb ((char *) &ob, i, &ctx);
+      if (sz != expsz)
+	{
+	  printf ("wcrtomb(%x) width in locale %s wrong "
+		  "(is %zd, should be %zd)\n", i, locname, sz, expsz);
+	  result = 1;
+	}
+      if (ob != expob)
+	{
+	  printf ("wcrtomb(%x) value in locale %s wrong "
+		  "(is %hhx, should be %hhx)\n", i, locname, ob, expob);
+	  result = 1;
+	}
+    }
+
   /* Test the new locale mechanisms.  */
   loc = newlocale (LC_ALL_MASK, locname, NULL);
   if (loc == NULL)
diff --git a/localedata/charmaps/POSIX b/localedata/charmaps/POSIX
new file mode 100644
index 0000000000..69bdf6b485
--- /dev/null
+++ b/localedata/charmaps/POSIX
@@ -0,0 +1,136 @@
+<code_set_name> POSIX
+<comment_char> %
+<escape_char> /
+% source: cf. localedata/locales/POSIX, LC_COLLATE
+
+CHARMAP
+<U0000>     /x00         NULL (NUL)
+<U0001>     /x01         START OF HEADING (SOH)
+<U0002>     /x02         START OF TEXT (STX)
+<U0003>     /x03         END OF TEXT (ETX)
+<U0004>     /x04         END OF TRANSMISSION (EOT)
+<U0005>     /x05         ENQUIRY (ENQ)
+<U0006>     /x06         ACKNOWLEDGE (ACK)
+<U0007>     /x07         BELL (BEL)
+<U0008>     /x08         BACKSPACE (BS)
+<U0009>     /x09         CHARACTER TABULATION (HT)
+<U000A>     /x0a         LINE FEED (LF)
+<U000B>     /x0b         LINE TABULATION (VT)
+<U000C>     /x0c         FORM FEED (FF)
+<U000D>     /x0d         CARRIAGE RETURN (CR)
+<U000E>     /x0e         SHIFT OUT (SO)
+<U000F>     /x0f         SHIFT IN (SI)
+<U0010>     /x10         DATALINK ESCAPE (DLE)
+<U0011>     /x11         DEVICE CONTROL ONE (DC1)
+<U0012>     /x12         DEVICE CONTROL TWO (DC2)
+<U0013>     /x13         DEVICE CONTROL THREE (DC3)
+<U0014>     /x14         DEVICE CONTROL FOUR (DC4)
+<U0015>     /x15         NEGATIVE ACKNOWLEDGE (NAK)
+<U0016>     /x16         SYNCHRONOUS IDLE (SYN)
+<U0017>     /x17         END OF TRANSMISSION BLOCK (ETB)
+<U0018>     /x18         CANCEL (CAN)
+<U0019>     /x19         END OF MEDIUM (EM)
+<U001A>     /x1a         SUBSTITUTE (SUB)
+<U001B>     /x1b         ESCAPE (ESC)
+<U001C>     /x1c         FILE SEPARATOR (IS4)
+<U001D>     /x1d         GROUP SEPARATOR (IS3)
+<U001E>     /x1e         RECORD SEPARATOR (IS2)
+<U001F>     /x1f         UNIT SEPARATOR (IS1)
+<U0020>     /x20         SPACE
+<U0021>     /x21         EXCLAMATION MARK
+<U0022>     /x22         QUOTATION MARK
+<U0023>     /x23         NUMBER SIGN
+<U0024>     /x24         DOLLAR SIGN
+<U0025>     /x25         PERCENT SIGN
+<U0026>     /x26         AMPERSAND
+<U0027>     /x27         APOSTROPHE
+<U0028>     /x28         LEFT PARENTHESIS
+<U0029>     /x29         RIGHT PARENTHESIS
+<U002A>     /x2a         ASTERISK
+<U002B>     /x2b         PLUS SIGN
+<U002C>     /x2c         COMMA
+<U002D>     /x2d         HYPHEN-MINUS
+<U002E>     /x2e         FULL STOP
+<U002F>     /x2f         SOLIDUS
+<U0030>     /x30         DIGIT ZERO
+<U0031>     /x31         DIGIT ONE
+<U0032>     /x32         DIGIT TWO
+<U0033>     /x33         DIGIT THREE
+<U0034>     /x34         DIGIT FOUR
+<U0035>     /x35         DIGIT FIVE
+<U0036>     /x36         DIGIT SIX
+<U0037>     /x37         DIGIT SEVEN
+<U0038>     /x38         DIGIT EIGHT
+<U0039>     /x39         DIGIT NINE
+<U003A>     /x3a         COLON
+<U003B>     /x3b         SEMICOLON
+<U003C>     /x3c         LESS-THAN SIGN
+<U003D>     /x3d         EQUALS SIGN
+<U003E>     /x3e         GREATER-THAN SIGN
+<U003F>     /x3f         QUESTION MARK
+<U0040>     /x40         COMMERCIAL AT
+<U0041>     /x41         LATIN CAPITAL LETTER A
+<U0042>     /x42         LATIN CAPITAL LETTER B
+<U0043>     /x43         LATIN CAPITAL LETTER C
+<U0044>     /x44         LATIN CAPITAL LETTER D
+<U0045>     /x45         LATIN CAPITAL LETTER E
+<U0046>     /x46         LATIN CAPITAL LETTER F
+<U0047>     /x47         LATIN CAPITAL LETTER G
+<U0048>     /x48         LATIN CAPITAL LETTER H
+<U0049>     /x49         LATIN CAPITAL LETTER I
+<U004A>     /x4a         LATIN CAPITAL LETTER J
+<U004B>     /x4b         LATIN CAPITAL LETTER K
+<U004C>     /x4c         LATIN CAPITAL LETTER L
+<U004D>     /x4d         LATIN CAPITAL LETTER M
+<U004E>     /x4e         LATIN CAPITAL LETTER N
+<U004F>     /x4f         LATIN CAPITAL LETTER O
+<U0050>     /x50         LATIN CAPITAL LETTER P
+<U0051>     /x51         LATIN CAPITAL LETTER Q
+<U0052>     /x52         LATIN CAPITAL LETTER R
+<U0053>     /x53         LATIN CAPITAL LETTER S
+<U0054>     /x54         LATIN CAPITAL LETTER T
+<U0055>     /x55         LATIN CAPITAL LETTER U
+<U0056>     /x56         LATIN CAPITAL LETTER V
+<U0057>     /x57         LATIN CAPITAL LETTER W
+<U0058>     /x58         LATIN CAPITAL LETTER X
+<U0059>     /x59         LATIN CAPITAL LETTER Y
+<U005A>     /x5a         LATIN CAPITAL LETTER Z
+<U005B>     /x5b         LEFT SQUARE BRACKET
+<U005C>     /x5c         REVERSE SOLIDUS
+<U005D>     /x5d         RIGHT SQUARE BRACKET
+<U005E>     /x5e         CIRCUMFLEX ACCENT
+<U005F>     /x5f         LOW LINE
+<U0060>     /x60         GRAVE ACCENT
+<U0061>     /x61         LATIN SMALL LETTER A
+<U0062>     /x62         LATIN SMALL LETTER B
+<U0063>     /x63         LATIN SMALL LETTER C
+<U0064>     /x64         LATIN SMALL LETTER D
+<U0065>     /x65         LATIN SMALL LETTER E
+<U0066>     /x66         LATIN SMALL LETTER F
+<U0067>     /x67         LATIN SMALL LETTER G
+<U0068>     /x68         LATIN SMALL LETTER H
+<U0069>     /x69         LATIN SMALL LETTER I
+<U006A>     /x6a         LATIN SMALL LETTER J
+<U006B>     /x6b         LATIN SMALL LETTER K
+<U006C>     /x6c         LATIN SMALL LETTER L
+<U006D>     /x6d         LATIN SMALL LETTER M
+<U006E>     /x6e         LATIN SMALL LETTER N
+<U006F>     /x6f         LATIN SMALL LETTER O
+<U0070>     /x70         LATIN SMALL LETTER P
+<U0071>     /x71         LATIN SMALL LETTER Q
+<U0072>     /x72         LATIN SMALL LETTER R
+<U0073>     /x73         LATIN SMALL LETTER S
+<U0074>     /x74         LATIN SMALL LETTER T
+<U0075>     /x75         LATIN SMALL LETTER U
+<U0076>     /x76         LATIN SMALL LETTER V
+<U0077>     /x77         LATIN SMALL LETTER W
+<U0078>     /x78         LATIN SMALL LETTER X
+<U0079>     /x79         LATIN SMALL LETTER Y
+<U007A>     /x7a         LATIN SMALL LETTER Z
+<U007B>     /x7b         LEFT CURLY BRACKET
+<U007C>     /x7c         VERTICAL LINE
+<U007D>     /x7d         RIGHT CURLY BRACKET
+<U007E>     /x7e         TILDE
+<U007F>     /x7f         DELETE (DEL)
+<UDC80>..<UDCFF> /x80
+END CHARMAP
diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
index 7ec7f1c577..45f2fa0b31 100644
--- a/localedata/locales/POSIX
+++ b/localedata/locales/POSIX
@@ -97,6 +97,20 @@ END LC_CTYPE
 LC_COLLATE
 % This is the POSIX Locale definition for the LC_COLLATE category.
 % The order is the same as in the ASCII code set.
+% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2,
+% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale:
+% > All characters not explicitly listed here shall be inserted
+% > in the character collation order after the listed characters
+% > and shall be assigned unique primary weights. If the listed
+% > characters have ASCII encoding, the other characters shall
+% > be in ascending order according to their coded character set values
+% Since Issue 7 TC2 (XBD, 6.2 Character Encoding):
+% > The POSIX locale shall contain 256 single-byte characters [...]
+% (cf. bug 663, 674).
+% this is in contrast to previous issues, which limited the POSIX
+% locale to the Portable Character Set (7-bit ASCII).
+% We use the same part of the Low Surrogate Area as Python
+% to contain these, yielding [<UDC80>, <UDCFF>]
 order_start forward
 <U0000>
 <U0001>
@@ -226,7 +240,134 @@ order_start forward
 <U007D>
 <U007E>
 <U007F>
-UNDEFINED
+<UDC80>
+<UDC81>
+<UDC82>
+<UDC83>
+<UDC84>
+<UDC85>
+<UDC86>
+<UDC87>
+<UDC88>
+<UDC89>
+<UDC8A>
+<UDC8B>
+<UDC8C>
+<UDC8D>
+<UDC8E>
+<UDC8F>
+<UDC90>
+<UDC91>
+<UDC92>
+<UDC93>
+<UDC94>
+<UDC95>
+<UDC96>
+<UDC97>
+<UDC98>
+<UDC99>
+<UDC9A>
+<UDC9B>
+<UDC9C>
+<UDC9D>
+<UDC9E>
+<UDC9F>
+<UDCA0>
+<UDCA1>
+<UDCA2>
+<UDCA3>
+<UDCA4>
+<UDCA5>
+<UDCA6>
+<UDCA7>
+<UDCA8>
+<UDCA9>
+<UDCAA>
+<UDCAB>
+<UDCAC>
+<UDCAD>
+<UDCAE>
+<UDCAF>
+<UDCB0>
+<UDCB1>
+<UDCB2>
+<UDCB3>
+<UDCB4>
+<UDCB5>
+<UDCB6>
+<UDCB7>
+<UDCB8>
+<UDCB9>
+<UDCBA>
+<UDCBB>
+<UDCBC>
+<UDCBD>
+<UDCBE>
+<UDCBF>
+<UDCC0>
+<UDCC1>
+<UDCC2>
+<UDCC3>
+<UDCC4>
+<UDCC5>
+<UDCC6>
+<UDCC7>
+<UDCC8>
+<UDCC9>
+<UDCCA>
+<UDCCB>
+<UDCCC>
+<UDCCD>
+<UDCCE>
+<UDCCF>
+<UDCD0>
+<UDCD1>
+<UDCD2>
+<UDCD3>
+<UDCD4>
+<UDCD5>
+<UDCD6>
+<UDCD7>
+<UDCD8>
+<UDCD9>
+<UDCDA>
+<UDCDB>
+<UDCDC>
+<UDCDD>
+<UDCDE>
+<UDCDF>
+<UDCE0>
+<UDCE1>
+<UDCE2>
+<UDCE3>
+<UDCE4>
+<UDCE5>
+<UDCE6>
+<UDCE7>
+<UDCE8>
+<UDCE9>
+<UDCEA>
+<UDCEB>
+<UDCEC>
+<UDCED>
+<UDCEE>
+<UDCEF>
+<UDCF0>
+<UDCF1>
+<UDCF2>
+<UDCF3>
+<UDCF4>
+<UDCF5>
+<UDCF6>
+<UDCF7>
+<UDCF8>
+<UDCF9>
+<UDCFA>
+<UDCFB>
+<UDCFC>
+<UDCFD>
+<UDCFE>
+<UDCFF>
 order_end
 %
 END LC_COLLATE
diff --git a/localedata/tst-c-utf8-consistency.c b/localedata/tst-c-utf8-consistency.c
index 1625e4dd0b..bd2f56834c 100644
--- a/localedata/tst-c-utf8-consistency.c
+++ b/localedata/tst-c-utf8-consistency.c
@@ -253,7 +253,7 @@ one_pass (void)
   TEST_COMPARE_STRING_WIDE (wstr (_NL_W_DATE_FMT), wstr_utf8 (_NL_W_DATE_FMT));
 
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_TIME_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_TIME_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_TIME_CODESET), "UTF-8");
 
   TEST_COMPARE_STRING (str (ALTMON_1), str_utf8 (ALTMON_1));
@@ -321,11 +321,11 @@ one_pass (void)
                             wstr_utf8 (_NL_WABALTMON_12));
 
   /* LC_COLLATE.  Mostly untested, only expected differences.  */
-  TEST_COMPARE_STRING (str (_NL_COLLATE_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_COLLATE_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_COLLATE_CODESET), "UTF-8");
 
   /* LC_CTYPE.  Mostly untested, only expected differences.  */
-  TEST_COMPARE_STRING (str (CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (CODESET), "UTF-8");
 
   /* LC_MONETARY.  */
@@ -401,7 +401,7 @@ one_pass (void)
   TEST_COMPARE (word (_NL_MONETARY_THOUSANDS_SEP_WC),
                 word_utf8 (_NL_MONETARY_THOUSANDS_SEP_WC));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_MONETARY_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_MONETARY_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_MONETARY_CODESET), "UTF-8");
 
   /* LC_NUMERIC.  */
@@ -416,7 +416,7 @@ one_pass (void)
   TEST_COMPARE (word (_NL_NUMERIC_THOUSANDS_SEP_WC),
                 word_utf8 (_NL_NUMERIC_THOUSANDS_SEP_WC));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_NUMERIC_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_NUMERIC_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_NUMERIC_CODESET), "UTF-8");
 
   /* LC_MESSAGES.  */
@@ -426,7 +426,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (YESSTR), str_utf8 (YESSTR));
   TEST_COMPARE_STRING (str (NOSTR), str_utf8 (NOSTR));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_MESSAGES_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_MESSAGES_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_MESSAGES_CODESET), "UTF-8");
 
   /* LC_PAPER.  */
@@ -434,7 +434,7 @@ one_pass (void)
   TEST_COMPARE (word (_NL_PAPER_HEIGHT), word_utf8 (_NL_PAPER_HEIGHT));
   TEST_COMPARE (word (_NL_PAPER_WIDTH), word_utf8 (_NL_PAPER_WIDTH));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_PAPER_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_PAPER_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_PAPER_CODESET), "UTF-8");
 
   /* LC_NAME.  */
@@ -452,7 +452,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (_NL_NAME_NAME_MS),
                        str_utf8 (_NL_NAME_NAME_MS));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_NAME_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_NAME_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_NAME_CODESET), "UTF-8");
 
   /* LC_ADDRESS.  */
@@ -482,7 +482,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (_NL_ADDRESS_LANG_LIB),
                        str_utf8 (_NL_ADDRESS_LANG_LIB));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_ADDRESS_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_ADDRESS_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_ADDRESS_CODESET), "UTF-8");
 
   /* LC_TELEPHONE.  */
@@ -496,7 +496,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (_NL_TELEPHONE_INT_PREFIX),
                        str_utf8 (_NL_TELEPHONE_INT_PREFIX));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_TELEPHONE_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_TELEPHONE_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_TELEPHONE_CODESET), "UTF-8");
 
   /* LC_MEASUREMENT.  */
@@ -504,7 +504,7 @@ one_pass (void)
   TEST_COMPARE (byte (_NL_MEASUREMENT_MEASUREMENT),
                 byte_utf8 (_NL_MEASUREMENT_MEASUREMENT));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_MEASUREMENT_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_MEASUREMENT_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_MEASUREMENT_CODESET), "UTF-8");
 
   /* LC_IDENTIFICATION is skipped since C.UTF-8 is distinct from C.  */
@@ -512,7 +512,7 @@ one_pass (void)
   /* _NL_IDENTIFICATION_CATEGORY cannot be tested because it is a
      string array.  */
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_IDENTIFICATION_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_IDENTIFICATION_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_IDENTIFICATION_CODESET), "UTF-8");
 }
 
diff --git a/stdio-common/Makefile b/stdio-common/Makefile
index 7cf8d814ea..ad38093b56 100644
--- a/stdio-common/Makefile
+++ b/stdio-common/Makefile
@@ -349,6 +349,7 @@ $(objpfx)test-vfprintf.out: $(gen-locales)
 $(objpfx)tst-grouping.out: $(gen-locales)
 $(objpfx)tst-grouping2.out: $(gen-locales)
 $(objpfx)tst-grouping_iterator.out: $(gen-locales)
+$(objpfx)tst-printf-bz25691-mem.out: $(gen-locales)
 $(objpfx)tst-sprintf.out: $(gen-locales)
 $(objpfx)tst-sscanf.out: $(gen-locales)
 $(objpfx)tst-swprintf.out: $(gen-locales)
diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
index e6fa2433fa..f953e0d956 100644
--- a/stdio-common/tst-printf-bz25691.c
+++ b/stdio-common/tst-printf-bz25691.c
@@ -30,6 +30,8 @@
 static int
 do_test (void)
 {
+  setlocale(LC_CTYPE, "C.UTF-8");
+
   mtrace ();
 
   /* For 's' conversion specifier with 'l' modifier the array must be
diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
index 7b338b6775..86666e8231 100644
--- a/wcsmbs/wcsmbsload.c
+++ b/wcsmbs/wcsmbsload.c
@@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
   .__shlib_handle = NULL,
   .__modname = NULL,
   .__counter = INT_MAX,
-  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
+  .__from_name = (char *) "POSIX",
   .__to_name = (char *) "INTERNAL",
-  .__fct = __gconv_transform_ascii_internal,
-  .__btowc_fct = __gconv_btwoc_ascii,
+  .__fct = __gconv_transform_posix_internal,
+  .__btowc_fct = __gconv_btwoc_posix,
   .__init_fct = NULL,
   .__end_fct = NULL,
   .__min_needed_from = 1,
@@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
   .__modname = NULL,
   .__counter = INT_MAX,
   .__from_name = (char *) "INTERNAL",
-  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
-  .__fct = __gconv_transform_internal_ascii,
+  .__to_name = (char *) "POSIX",
+  .__fct = __gconv_transform_internal_posix,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
@@ -67,7 +67,9 @@ static const struct __gconv_step to_mb =
 };
 
 
-/* For the default locale we only have to handle ANSI_X3.4-1968.  */
+/* The default/"POSIX"/"C" locale is an 8-bit-clean mapping
+   with ANSI_X3.4-1968 in the first 128 characters;
+   we lift the remaining bytes by <UDC00>.  */
 const struct gconv_fcts __wcsmbs_gconv_fcts_c =
 {
   .towc = (struct __gconv_step *) &to_wc,
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* [PATCH v13] POSIX locale covers every byte [BZ# 29511]
  2023-05-07 22:53                                 ` [PATCH v12] " наб
@ 2023-05-29 13:54                                   ` наб
  0 siblings, 0 replies; 29+ messages in thread
From: наб @ 2023-05-29 13:54 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha, Victor Stinner

[-- Attachment #1: Type: text/plain, Size: 29670 bytes --]

This largely duplicates the ASCII code with the error path changed

There are two user-facing changes:
  * nl_langinfo(CODESET) is "POSIX" instead of "ANSI_X3.4-1968"
  * mbrtowc() and friends return b if b <= 0x7F else <UDC00>+b

Since Issue 7 TC 2/Issue 8, the C/POSIX locale, effectively,
  (a) is 1-byte, stateless, and contains 256 characters
  (b) they collate in byte order
  (c) the first 128 characters are equivalent to ASCII (like previous)
cf. https://www.austingroupbugs.net/view.php?id=663 for a summary of
changes to the standard;
in short, this means that mbrtowc() must never fail and must return
  b if b <= 0x7F else ab+c for all bytes b
  where c is some constant >=0x80
    and a is a positive integer constant

By strategically picking c=<UDC00> we land at the same point of the
Unicode Low Surrogate Area at DC00-DCFF, described as
  > Isolated surrogate code points have no interpretation;
  > consequently, no character code charts or names lists
  > are provided for this range.
as the Python UTF-8 errors=surrogateescape encoding.

Signed-off-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
---
Clean rebase on current master.

 NEWS                                |   8 ++
 iconv/Makefile                      |   2 +-
 iconv/gconv_builtin.h               |   8 ++
 iconv/gconv_int.h                   |   8 ++
 iconv/gconv_posix.c                 |  94 ++++++++++++++++++
 iconv/tst-iconv_prog.sh             |  43 +++++++++
 iconvdata/tst-tables.sh             |   1 +
 inet/tst-idna_name_classify.c       |   6 +-
 locale/C_name.c                     |   2 +-
 locale/tst-C-locale.c               |  44 +++++++++
 localedata/charmaps/POSIX           | 136 ++++++++++++++++++++++++++
 localedata/locales/POSIX            | 143 +++++++++++++++++++++++++++-
 localedata/tst-c-utf8-consistency.c |  24 ++---
 stdio-common/Makefile               |   1 +
 stdio-common/tst-printf-bz25691.c   |   2 +
 wcsmbs/wcsmbsload.c                 |  14 +--
 16 files changed, 512 insertions(+), 24 deletions(-)
 create mode 100644 iconv/gconv_posix.c
 create mode 100644 localedata/charmaps/POSIX

diff --git a/NEWS b/NEWS
index a52c17c677..4fdc39b14e 100644
--- a/NEWS
+++ b/NEWS
@@ -35,6 +35,14 @@ Major new features:
   The symbol names follow the AArch64 vector ABI, they are declared
   in math.h and have to be called manually at this point.
 
+* The default/"POSIX"/"C" locale's character set is now "POSIX",
+  instead of "ANSI_X3.4-1968" ‒ this is a new fully-reversible
+  8-bit transparent encoding for compatibility with POSIX Issue 7 TC 2,
+  identity-mapping bytes in the ASCII [0, 0x7F] range,
+  and mapping [0x80, 0xFF] bytes to [<U+DC80>, <U+DCFF>].
+  The standard now requires the "POSIX"/"C" locale to have an encoding
+  with these features ‒ 8-bit transparency and a continuous collation sequence.
+
 Deprecated and removed features, and other changes affecting compatibility:
 
 * In the Linux kernel for the hppa/parisc architecture some of the
diff --git a/iconv/Makefile b/iconv/Makefile
index afb3fb7bdb..b61e130377 100644
--- a/iconv/Makefile
+++ b/iconv/Makefile
@@ -25,7 +25,7 @@ include ../Makeconfig
 headers		= iconv.h gconv.h
 routines	= iconv_open iconv iconv_close \
 		  gconv_open gconv gconv_close gconv_db gconv_conf \
-		  gconv_builtin gconv_simple gconv_trans gconv_cache
+		  gconv_builtin gconv_simple gconv_posix gconv_trans gconv_cache
 routines	+= gconv_dl gconv_charset
 
 vpath %.c ../locale/programs ../intl
diff --git a/iconv/gconv_builtin.h b/iconv/gconv_builtin.h
index 35608b4461..d2dcdd44a3 100644
--- a/iconv/gconv_builtin.h
+++ b/iconv/gconv_builtin.h
@@ -89,6 +89,14 @@ BUILTIN_TRANSFORMATION ("INTERNAL", "ANSI_X3.4-1968//", 1, "=INTERNAL->ascii",
 			__gconv_transform_internal_ascii, NULL, 4, 4, 1, 1)
 
 
+BUILTIN_TRANSFORMATION ("POSIX//", "INTERNAL", 1, "=posix->INTERNAL",
+			__gconv_transform_posix_internal, __gconv_btwoc_posix,
+			1, 1, 4, 4)
+
+BUILTIN_TRANSFORMATION ("INTERNAL", "POSIX//", 1, "=INTERNAL->posix",
+			__gconv_transform_internal_posix, NULL, 4, 4, 1, 1)
+
+
 #if BYTE_ORDER == BIG_ENDIAN
 BUILTIN_ALIAS ("UNICODEBIG//", "ISO-10646/UCS2/")
 BUILTIN_ALIAS ("UCS-2BE//", "ISO-10646/UCS2/")
diff --git a/iconv/gconv_int.h b/iconv/gconv_int.h
index 19d042faff..3d0889b321 100644
--- a/iconv/gconv_int.h
+++ b/iconv/gconv_int.h
@@ -309,6 +309,8 @@ extern int __gconv_compare_alias (const char *name1, const char *name2)
 
 __BUILTIN_TRANSFORM (__gconv_transform_ascii_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_ascii);
+__BUILTIN_TRANSFORM (__gconv_transform_posix_internal);
+__BUILTIN_TRANSFORM (__gconv_transform_internal_posix);
 __BUILTIN_TRANSFORM (__gconv_transform_utf8_internal);
 __BUILTIN_TRANSFORM (__gconv_transform_internal_utf8);
 __BUILTIN_TRANSFORM (__gconv_transform_ucs2_internal);
@@ -327,6 +329,12 @@ __BUILTIN_TRANSFORM (__gconv_transform_utf16_internal);
    only ASCII characters.  */
 extern wint_t __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c);
 
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DC80, U+DCFF].  */
+extern wint_t __gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+     attribute_hidden;
+
 #endif
 
 __END_DECLS
diff --git a/iconv/gconv_posix.c b/iconv/gconv_posix.c
new file mode 100644
index 0000000000..885929baca
--- /dev/null
+++ b/iconv/gconv_posix.c
@@ -0,0 +1,94 @@
+/* "POSIX" locale transformation functions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <gconv_int.h>
+
+
+/* Specialized conversion function for a single byte to INTERNAL,
+   identity-mapping bytes [0, 0x7F], and moving [0x80, 0xFF] into the end
+   of the Low Surrogate Area at [U+DC80, U+DCFF].  */
+wint_t
+__gconv_btwoc_posix (struct __gconv_step *step, unsigned char c)
+{
+  if (c < 0x80)
+    return c;
+  else
+    return 0xdc00 + c;
+}
+
+
+/* Convert from {[0, 0x7F] => ISO 646-IRV; [0x80, 0xFF] => [U+DC80, U+DCFF]}
+   to the internal (UCS4-like) format.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		1
+#define MIN_NEEDED_TO		4
+#define FROM_DIRECTION		1
+#define FROM_LOOP		posix_internal_loop
+#define TO_LOOP			posix_internal_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_posix_internal
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    if (__glibc_unlikely (*inptr > '\x7f'))				      \
+      *((uint32_t *) outptr) = 0xdc00 + *inptr++;			      \
+    else								      \
+      *((uint32_t *) outptr) = *inptr++;				      \
+    outptr += sizeof (uint32_t);					      \
+  }
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
+
+
+/* Convert from the internal (UCS4-like) format to
+   {ISO 646-IRV => [0, 0x7F]; [U+DC80, U+DCFF] => [0x80, 0xFF]}.  */
+#define DEFINE_INIT		0
+#define DEFINE_FINI		0
+#define MIN_NEEDED_FROM		4
+#define MIN_NEEDED_TO		1
+#define FROM_DIRECTION		1
+#define FROM_LOOP		internal_posix_loop
+#define TO_LOOP			internal_posix_loop /* This is not used.  */
+#define FUNCTION_NAME		__gconv_transform_internal_posix
+#define ONE_DIRECTION		1
+
+#define MIN_NEEDED_INPUT	MIN_NEEDED_FROM
+#define MIN_NEEDED_OUTPUT	MIN_NEEDED_TO
+#define LOOPFCT			FROM_LOOP
+#define BODY \
+  {									      \
+    uint32_t val = *((const uint32_t *) inptr);				      \
+    if (__glibc_unlikely ((val > 0x7f && val < 0xdc80) || val > 0xdcff))      \
+      {									      \
+	UNICODE_TAG_HANDLER (val, 4);					      \
+	STANDARD_TO_LOOP_ERR_HANDLER (4);				      \
+      }									      \
+    else								      \
+      {									      \
+	*outptr++ = val & 0xff;						      \
+	inptr += sizeof (uint32_t);					      \
+      }									      \
+  }
+#define LOOP_NEED_FLAGS
+#include <iconv/loop.c>
+#include <iconv/skeleton.c>
diff --git a/iconv/tst-iconv_prog.sh b/iconv/tst-iconv_prog.sh
index 76400cddfc..c757fb2c40 100644
--- a/iconv/tst-iconv_prog.sh
+++ b/iconv/tst-iconv_prog.sh
@@ -285,3 +285,46 @@ for errorcommand in "${errorarray[@]}"; do
   execute_test
   check_errtest_result
 done
+
+allbytes ()
+{
+  for (( i = 0; i <= 255; i++ )); do
+    printf '\'"$(printf "%o" "$i")"
+  done
+}
+
+allucs4be ()
+{
+  for (( i = 0; i <= 127; i++ )); do
+    printf '\0\0\0\'"$(printf "%o" "$i")"
+  done
+  for (( i = 128; i <= 255; i++ )); do
+    printf '\0\0\xdc\'"$(printf "%o" "$i")"
+  done
+}
+
+check_posix_result ()
+{
+  if [ $? -eq 0 ]; then
+    result=PASS
+  else
+    result=FAIL
+  fi
+
+  echo "$result: from \"$1\", to: \"$2\""
+
+  if [ "$result" != "PASS" ]; then
+    exit 1
+  fi
+}
+
+check_posix_encoding ()
+{
+  eval PROG=\"$ICONV\"
+  allbytes  | $PROG -f POSIX -t UCS-4BE | cmp -s - <(allucs4be)
+  check_posix_result POSIX UCS-4BE
+  allucs4be | $PROG -f UCS-4BE -t POSIX | cmp -s - <(allbytes)
+  check_posix_result UCS-4BE POSIX
+}
+
+check_posix_encoding
diff --git a/iconvdata/tst-tables.sh b/iconvdata/tst-tables.sh
index ddac85daa1..badce3e4ca 100755
--- a/iconvdata/tst-tables.sh
+++ b/iconvdata/tst-tables.sh
@@ -31,6 +31,7 @@ cat <<EOF |
   # Keep this list in the same order as gconv-modules.
   #
   # charset name    table name          comment
+  POSIX
   ASCII             ANSI_X3.4-1968
   ISO646-GB         BS_4730
   ISO646-CA         CSA_Z243.4-1985-1
diff --git a/inet/tst-idna_name_classify.c b/inet/tst-idna_name_classify.c
index bb1c0b5331..0f645cbca5 100644
--- a/inet/tst-idna_name_classify.c
+++ b/inet/tst-idna_name_classify.c
@@ -37,11 +37,11 @@ do_test (void)
   puts ("info: C locale tests");
   locale_insensitive_tests ();
   TEST_COMPARE (__idna_name_classify ("abc\200def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
   TEST_COMPARE (__idna_name_classify ("abc\200\\def"),
-                idna_name_encoding_error);
+                idna_name_nonascii_backslash);
   TEST_COMPARE (__idna_name_classify ("abc\377def"),
-                idna_name_encoding_error);
+                idna_name_nonascii);
 
   puts ("info: en_US.ISO-8859-1 locale tests");
   if (setlocale (LC_CTYPE, "en_US.ISO-8859-1") == 0)
diff --git a/locale/C_name.c b/locale/C_name.c
index 7612544f2f..2f52636828 100644
--- a/locale/C_name.c
+++ b/locale/C_name.c
@@ -8,4 +8,4 @@ const char _nl_C_name[] = "C";
 const char _nl_POSIX_name[] = "POSIX";
 
 /* The standard codeset.  */
-const char _nl_C_codeset[] = "ANSI_X3.4-1968";
+const char _nl_C_codeset[] = "POSIX";
diff --git a/locale/tst-C-locale.c b/locale/tst-C-locale.c
index d4c22b749a..a25bff4910 100644
--- a/locale/tst-C-locale.c
+++ b/locale/tst-C-locale.c
@@ -20,6 +20,7 @@
 #include <langinfo.h>
 #include <limits.h>
 #include <locale.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <string.h>
 #include <wchar.h>
@@ -229,6 +230,49 @@ run_test (const char *locname)
   STRTEST (YESSTR, "");
   STRTEST (NOSTR, "");
 
+  for(int i = 0; i <= 0xff; ++i)
+    {
+      unsigned char bs[] = {i, 0};
+      mbstate_t ctx = {};
+      wchar_t wc = -1, exp = i <= 0x7f ? i : (0xdc00 + i);
+      size_t sz = mbrtowc(&wc, (char *) bs, 1, &ctx);
+      if (sz != !!i)
+	{
+	  printf ("mbrtowc(%02hhx) width in locale %s wrong "
+		  "(is %zd, should be %d)\n", *bs, locname, sz, !!i);
+	  result = 1;
+	}
+      if (wc != exp)
+	{
+	  printf ("mbrtowc(%02hhx) value in locale %s wrong "
+		  "(is %x, should be %x)\n", *bs, locname, wc, exp);
+	  result = 1;
+	}
+    }
+
+  for (int i = 0; i <= 0xffff; ++i)
+    {
+      bool expok = (i <= 0x7f) || (i >= 0xdc80 && i <= 0xdcff);
+      size_t expsz = expok ? 1 : (size_t) -1;
+      unsigned char expob = expok ? (i & 0xff) : (unsigned char) -1;
+
+      unsigned char ob = -1;
+      mbstate_t ctx = {};
+      size_t sz = wcrtomb ((char *) &ob, i, &ctx);
+      if (sz != expsz)
+	{
+	  printf ("wcrtomb(%x) width in locale %s wrong "
+		  "(is %zd, should be %zd)\n", i, locname, sz, expsz);
+	  result = 1;
+	}
+      if (ob != expob)
+	{
+	  printf ("wcrtomb(%x) value in locale %s wrong "
+		  "(is %hhx, should be %hhx)\n", i, locname, ob, expob);
+	  result = 1;
+	}
+    }
+
   /* Test the new locale mechanisms.  */
   loc = newlocale (LC_ALL_MASK, locname, NULL);
   if (loc == NULL)
diff --git a/localedata/charmaps/POSIX b/localedata/charmaps/POSIX
new file mode 100644
index 0000000000..69bdf6b485
--- /dev/null
+++ b/localedata/charmaps/POSIX
@@ -0,0 +1,136 @@
+<code_set_name> POSIX
+<comment_char> %
+<escape_char> /
+% source: cf. localedata/locales/POSIX, LC_COLLATE
+
+CHARMAP
+<U0000>     /x00         NULL (NUL)
+<U0001>     /x01         START OF HEADING (SOH)
+<U0002>     /x02         START OF TEXT (STX)
+<U0003>     /x03         END OF TEXT (ETX)
+<U0004>     /x04         END OF TRANSMISSION (EOT)
+<U0005>     /x05         ENQUIRY (ENQ)
+<U0006>     /x06         ACKNOWLEDGE (ACK)
+<U0007>     /x07         BELL (BEL)
+<U0008>     /x08         BACKSPACE (BS)
+<U0009>     /x09         CHARACTER TABULATION (HT)
+<U000A>     /x0a         LINE FEED (LF)
+<U000B>     /x0b         LINE TABULATION (VT)
+<U000C>     /x0c         FORM FEED (FF)
+<U000D>     /x0d         CARRIAGE RETURN (CR)
+<U000E>     /x0e         SHIFT OUT (SO)
+<U000F>     /x0f         SHIFT IN (SI)
+<U0010>     /x10         DATALINK ESCAPE (DLE)
+<U0011>     /x11         DEVICE CONTROL ONE (DC1)
+<U0012>     /x12         DEVICE CONTROL TWO (DC2)
+<U0013>     /x13         DEVICE CONTROL THREE (DC3)
+<U0014>     /x14         DEVICE CONTROL FOUR (DC4)
+<U0015>     /x15         NEGATIVE ACKNOWLEDGE (NAK)
+<U0016>     /x16         SYNCHRONOUS IDLE (SYN)
+<U0017>     /x17         END OF TRANSMISSION BLOCK (ETB)
+<U0018>     /x18         CANCEL (CAN)
+<U0019>     /x19         END OF MEDIUM (EM)
+<U001A>     /x1a         SUBSTITUTE (SUB)
+<U001B>     /x1b         ESCAPE (ESC)
+<U001C>     /x1c         FILE SEPARATOR (IS4)
+<U001D>     /x1d         GROUP SEPARATOR (IS3)
+<U001E>     /x1e         RECORD SEPARATOR (IS2)
+<U001F>     /x1f         UNIT SEPARATOR (IS1)
+<U0020>     /x20         SPACE
+<U0021>     /x21         EXCLAMATION MARK
+<U0022>     /x22         QUOTATION MARK
+<U0023>     /x23         NUMBER SIGN
+<U0024>     /x24         DOLLAR SIGN
+<U0025>     /x25         PERCENT SIGN
+<U0026>     /x26         AMPERSAND
+<U0027>     /x27         APOSTROPHE
+<U0028>     /x28         LEFT PARENTHESIS
+<U0029>     /x29         RIGHT PARENTHESIS
+<U002A>     /x2a         ASTERISK
+<U002B>     /x2b         PLUS SIGN
+<U002C>     /x2c         COMMA
+<U002D>     /x2d         HYPHEN-MINUS
+<U002E>     /x2e         FULL STOP
+<U002F>     /x2f         SOLIDUS
+<U0030>     /x30         DIGIT ZERO
+<U0031>     /x31         DIGIT ONE
+<U0032>     /x32         DIGIT TWO
+<U0033>     /x33         DIGIT THREE
+<U0034>     /x34         DIGIT FOUR
+<U0035>     /x35         DIGIT FIVE
+<U0036>     /x36         DIGIT SIX
+<U0037>     /x37         DIGIT SEVEN
+<U0038>     /x38         DIGIT EIGHT
+<U0039>     /x39         DIGIT NINE
+<U003A>     /x3a         COLON
+<U003B>     /x3b         SEMICOLON
+<U003C>     /x3c         LESS-THAN SIGN
+<U003D>     /x3d         EQUALS SIGN
+<U003E>     /x3e         GREATER-THAN SIGN
+<U003F>     /x3f         QUESTION MARK
+<U0040>     /x40         COMMERCIAL AT
+<U0041>     /x41         LATIN CAPITAL LETTER A
+<U0042>     /x42         LATIN CAPITAL LETTER B
+<U0043>     /x43         LATIN CAPITAL LETTER C
+<U0044>     /x44         LATIN CAPITAL LETTER D
+<U0045>     /x45         LATIN CAPITAL LETTER E
+<U0046>     /x46         LATIN CAPITAL LETTER F
+<U0047>     /x47         LATIN CAPITAL LETTER G
+<U0048>     /x48         LATIN CAPITAL LETTER H
+<U0049>     /x49         LATIN CAPITAL LETTER I
+<U004A>     /x4a         LATIN CAPITAL LETTER J
+<U004B>     /x4b         LATIN CAPITAL LETTER K
+<U004C>     /x4c         LATIN CAPITAL LETTER L
+<U004D>     /x4d         LATIN CAPITAL LETTER M
+<U004E>     /x4e         LATIN CAPITAL LETTER N
+<U004F>     /x4f         LATIN CAPITAL LETTER O
+<U0050>     /x50         LATIN CAPITAL LETTER P
+<U0051>     /x51         LATIN CAPITAL LETTER Q
+<U0052>     /x52         LATIN CAPITAL LETTER R
+<U0053>     /x53         LATIN CAPITAL LETTER S
+<U0054>     /x54         LATIN CAPITAL LETTER T
+<U0055>     /x55         LATIN CAPITAL LETTER U
+<U0056>     /x56         LATIN CAPITAL LETTER V
+<U0057>     /x57         LATIN CAPITAL LETTER W
+<U0058>     /x58         LATIN CAPITAL LETTER X
+<U0059>     /x59         LATIN CAPITAL LETTER Y
+<U005A>     /x5a         LATIN CAPITAL LETTER Z
+<U005B>     /x5b         LEFT SQUARE BRACKET
+<U005C>     /x5c         REVERSE SOLIDUS
+<U005D>     /x5d         RIGHT SQUARE BRACKET
+<U005E>     /x5e         CIRCUMFLEX ACCENT
+<U005F>     /x5f         LOW LINE
+<U0060>     /x60         GRAVE ACCENT
+<U0061>     /x61         LATIN SMALL LETTER A
+<U0062>     /x62         LATIN SMALL LETTER B
+<U0063>     /x63         LATIN SMALL LETTER C
+<U0064>     /x64         LATIN SMALL LETTER D
+<U0065>     /x65         LATIN SMALL LETTER E
+<U0066>     /x66         LATIN SMALL LETTER F
+<U0067>     /x67         LATIN SMALL LETTER G
+<U0068>     /x68         LATIN SMALL LETTER H
+<U0069>     /x69         LATIN SMALL LETTER I
+<U006A>     /x6a         LATIN SMALL LETTER J
+<U006B>     /x6b         LATIN SMALL LETTER K
+<U006C>     /x6c         LATIN SMALL LETTER L
+<U006D>     /x6d         LATIN SMALL LETTER M
+<U006E>     /x6e         LATIN SMALL LETTER N
+<U006F>     /x6f         LATIN SMALL LETTER O
+<U0070>     /x70         LATIN SMALL LETTER P
+<U0071>     /x71         LATIN SMALL LETTER Q
+<U0072>     /x72         LATIN SMALL LETTER R
+<U0073>     /x73         LATIN SMALL LETTER S
+<U0074>     /x74         LATIN SMALL LETTER T
+<U0075>     /x75         LATIN SMALL LETTER U
+<U0076>     /x76         LATIN SMALL LETTER V
+<U0077>     /x77         LATIN SMALL LETTER W
+<U0078>     /x78         LATIN SMALL LETTER X
+<U0079>     /x79         LATIN SMALL LETTER Y
+<U007A>     /x7a         LATIN SMALL LETTER Z
+<U007B>     /x7b         LEFT CURLY BRACKET
+<U007C>     /x7c         VERTICAL LINE
+<U007D>     /x7d         RIGHT CURLY BRACKET
+<U007E>     /x7e         TILDE
+<U007F>     /x7f         DELETE (DEL)
+<UDC80>..<UDCFF> /x80
+END CHARMAP
diff --git a/localedata/locales/POSIX b/localedata/locales/POSIX
index 7ec7f1c577..45f2fa0b31 100644
--- a/localedata/locales/POSIX
+++ b/localedata/locales/POSIX
@@ -97,6 +97,20 @@ END LC_CTYPE
 LC_COLLATE
 % This is the POSIX Locale definition for the LC_COLLATE category.
 % The order is the same as in the ASCII code set.
+% Values above <DEL> (<U007F>) inserted in order, per Issue 7 TC2,
+% XBD, 7.3.2, LC_COLLATE Category in the POSIX Locale:
+% > All characters not explicitly listed here shall be inserted
+% > in the character collation order after the listed characters
+% > and shall be assigned unique primary weights. If the listed
+% > characters have ASCII encoding, the other characters shall
+% > be in ascending order according to their coded character set values
+% Since Issue 7 TC2 (XBD, 6.2 Character Encoding):
+% > The POSIX locale shall contain 256 single-byte characters [...]
+% (cf. bug 663, 674).
+% this is in contrast to previous issues, which limited the POSIX
+% locale to the Portable Character Set (7-bit ASCII).
+% We use the same part of the Low Surrogate Area as Python
+% to contain these, yielding [<UDC80>, <UDCFF>]
 order_start forward
 <U0000>
 <U0001>
@@ -226,7 +240,134 @@ order_start forward
 <U007D>
 <U007E>
 <U007F>
-UNDEFINED
+<UDC80>
+<UDC81>
+<UDC82>
+<UDC83>
+<UDC84>
+<UDC85>
+<UDC86>
+<UDC87>
+<UDC88>
+<UDC89>
+<UDC8A>
+<UDC8B>
+<UDC8C>
+<UDC8D>
+<UDC8E>
+<UDC8F>
+<UDC90>
+<UDC91>
+<UDC92>
+<UDC93>
+<UDC94>
+<UDC95>
+<UDC96>
+<UDC97>
+<UDC98>
+<UDC99>
+<UDC9A>
+<UDC9B>
+<UDC9C>
+<UDC9D>
+<UDC9E>
+<UDC9F>
+<UDCA0>
+<UDCA1>
+<UDCA2>
+<UDCA3>
+<UDCA4>
+<UDCA5>
+<UDCA6>
+<UDCA7>
+<UDCA8>
+<UDCA9>
+<UDCAA>
+<UDCAB>
+<UDCAC>
+<UDCAD>
+<UDCAE>
+<UDCAF>
+<UDCB0>
+<UDCB1>
+<UDCB2>
+<UDCB3>
+<UDCB4>
+<UDCB5>
+<UDCB6>
+<UDCB7>
+<UDCB8>
+<UDCB9>
+<UDCBA>
+<UDCBB>
+<UDCBC>
+<UDCBD>
+<UDCBE>
+<UDCBF>
+<UDCC0>
+<UDCC1>
+<UDCC2>
+<UDCC3>
+<UDCC4>
+<UDCC5>
+<UDCC6>
+<UDCC7>
+<UDCC8>
+<UDCC9>
+<UDCCA>
+<UDCCB>
+<UDCCC>
+<UDCCD>
+<UDCCE>
+<UDCCF>
+<UDCD0>
+<UDCD1>
+<UDCD2>
+<UDCD3>
+<UDCD4>
+<UDCD5>
+<UDCD6>
+<UDCD7>
+<UDCD8>
+<UDCD9>
+<UDCDA>
+<UDCDB>
+<UDCDC>
+<UDCDD>
+<UDCDE>
+<UDCDF>
+<UDCE0>
+<UDCE1>
+<UDCE2>
+<UDCE3>
+<UDCE4>
+<UDCE5>
+<UDCE6>
+<UDCE7>
+<UDCE8>
+<UDCE9>
+<UDCEA>
+<UDCEB>
+<UDCEC>
+<UDCED>
+<UDCEE>
+<UDCEF>
+<UDCF0>
+<UDCF1>
+<UDCF2>
+<UDCF3>
+<UDCF4>
+<UDCF5>
+<UDCF6>
+<UDCF7>
+<UDCF8>
+<UDCF9>
+<UDCFA>
+<UDCFB>
+<UDCFC>
+<UDCFD>
+<UDCFE>
+<UDCFF>
 order_end
 %
 END LC_COLLATE
diff --git a/localedata/tst-c-utf8-consistency.c b/localedata/tst-c-utf8-consistency.c
index 1625e4dd0b..bd2f56834c 100644
--- a/localedata/tst-c-utf8-consistency.c
+++ b/localedata/tst-c-utf8-consistency.c
@@ -253,7 +253,7 @@ one_pass (void)
   TEST_COMPARE_STRING_WIDE (wstr (_NL_W_DATE_FMT), wstr_utf8 (_NL_W_DATE_FMT));
 
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_TIME_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_TIME_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_TIME_CODESET), "UTF-8");
 
   TEST_COMPARE_STRING (str (ALTMON_1), str_utf8 (ALTMON_1));
@@ -321,11 +321,11 @@ one_pass (void)
                             wstr_utf8 (_NL_WABALTMON_12));
 
   /* LC_COLLATE.  Mostly untested, only expected differences.  */
-  TEST_COMPARE_STRING (str (_NL_COLLATE_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_COLLATE_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_COLLATE_CODESET), "UTF-8");
 
   /* LC_CTYPE.  Mostly untested, only expected differences.  */
-  TEST_COMPARE_STRING (str (CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (CODESET), "UTF-8");
 
   /* LC_MONETARY.  */
@@ -401,7 +401,7 @@ one_pass (void)
   TEST_COMPARE (word (_NL_MONETARY_THOUSANDS_SEP_WC),
                 word_utf8 (_NL_MONETARY_THOUSANDS_SEP_WC));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_MONETARY_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_MONETARY_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_MONETARY_CODESET), "UTF-8");
 
   /* LC_NUMERIC.  */
@@ -416,7 +416,7 @@ one_pass (void)
   TEST_COMPARE (word (_NL_NUMERIC_THOUSANDS_SEP_WC),
                 word_utf8 (_NL_NUMERIC_THOUSANDS_SEP_WC));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_NUMERIC_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_NUMERIC_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_NUMERIC_CODESET), "UTF-8");
 
   /* LC_MESSAGES.  */
@@ -426,7 +426,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (YESSTR), str_utf8 (YESSTR));
   TEST_COMPARE_STRING (str (NOSTR), str_utf8 (NOSTR));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_MESSAGES_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_MESSAGES_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_MESSAGES_CODESET), "UTF-8");
 
   /* LC_PAPER.  */
@@ -434,7 +434,7 @@ one_pass (void)
   TEST_COMPARE (word (_NL_PAPER_HEIGHT), word_utf8 (_NL_PAPER_HEIGHT));
   TEST_COMPARE (word (_NL_PAPER_WIDTH), word_utf8 (_NL_PAPER_WIDTH));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_PAPER_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_PAPER_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_PAPER_CODESET), "UTF-8");
 
   /* LC_NAME.  */
@@ -452,7 +452,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (_NL_NAME_NAME_MS),
                        str_utf8 (_NL_NAME_NAME_MS));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_NAME_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_NAME_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_NAME_CODESET), "UTF-8");
 
   /* LC_ADDRESS.  */
@@ -482,7 +482,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (_NL_ADDRESS_LANG_LIB),
                        str_utf8 (_NL_ADDRESS_LANG_LIB));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_ADDRESS_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_ADDRESS_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_ADDRESS_CODESET), "UTF-8");
 
   /* LC_TELEPHONE.  */
@@ -496,7 +496,7 @@ one_pass (void)
   TEST_COMPARE_STRING (str (_NL_TELEPHONE_INT_PREFIX),
                        str_utf8 (_NL_TELEPHONE_INT_PREFIX));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_TELEPHONE_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_TELEPHONE_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_TELEPHONE_CODESET), "UTF-8");
 
   /* LC_MEASUREMENT.  */
@@ -504,7 +504,7 @@ one_pass (void)
   TEST_COMPARE (byte (_NL_MEASUREMENT_MEASUREMENT),
                 byte_utf8 (_NL_MEASUREMENT_MEASUREMENT));
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_MEASUREMENT_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_MEASUREMENT_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_MEASUREMENT_CODESET), "UTF-8");
 
   /* LC_IDENTIFICATION is skipped since C.UTF-8 is distinct from C.  */
@@ -512,7 +512,7 @@ one_pass (void)
   /* _NL_IDENTIFICATION_CATEGORY cannot be tested because it is a
      string array.  */
   /* Expected difference.  */
-  TEST_COMPARE_STRING (str (_NL_IDENTIFICATION_CODESET), "ANSI_X3.4-1968");
+  TEST_COMPARE_STRING (str (_NL_IDENTIFICATION_CODESET), "POSIX");
   TEST_COMPARE_STRING (str_utf8 (_NL_IDENTIFICATION_CODESET), "UTF-8");
 }
 
diff --git a/stdio-common/Makefile b/stdio-common/Makefile
index 4c15b97683..291f502878 100644
--- a/stdio-common/Makefile
+++ b/stdio-common/Makefile
@@ -359,6 +359,7 @@ $(objpfx)test-vfprintf.out: $(gen-locales)
 $(objpfx)tst-grouping.out: $(gen-locales)
 $(objpfx)tst-grouping2.out: $(gen-locales)
 $(objpfx)tst-grouping_iterator.out: $(gen-locales)
+$(objpfx)tst-printf-bz25691-mem.out: $(gen-locales)
 $(objpfx)tst-sprintf.out: $(gen-locales)
 $(objpfx)tst-sscanf.out: $(gen-locales)
 $(objpfx)tst-swprintf.out: $(gen-locales)
diff --git a/stdio-common/tst-printf-bz25691.c b/stdio-common/tst-printf-bz25691.c
index e6fa2433fa..f953e0d956 100644
--- a/stdio-common/tst-printf-bz25691.c
+++ b/stdio-common/tst-printf-bz25691.c
@@ -30,6 +30,8 @@
 static int
 do_test (void)
 {
+  setlocale(LC_CTYPE, "C.UTF-8");
+
   mtrace ();
 
   /* For 's' conversion specifier with 'l' modifier the array must be
diff --git a/wcsmbs/wcsmbsload.c b/wcsmbs/wcsmbsload.c
index 7b338b6775..86666e8231 100644
--- a/wcsmbs/wcsmbsload.c
+++ b/wcsmbs/wcsmbsload.c
@@ -33,10 +33,10 @@ static const struct __gconv_step to_wc =
   .__shlib_handle = NULL,
   .__modname = NULL,
   .__counter = INT_MAX,
-  .__from_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
+  .__from_name = (char *) "POSIX",
   .__to_name = (char *) "INTERNAL",
-  .__fct = __gconv_transform_ascii_internal,
-  .__btowc_fct = __gconv_btwoc_ascii,
+  .__fct = __gconv_transform_posix_internal,
+  .__btowc_fct = __gconv_btwoc_posix,
   .__init_fct = NULL,
   .__end_fct = NULL,
   .__min_needed_from = 1,
@@ -53,8 +53,8 @@ static const struct __gconv_step to_mb =
   .__modname = NULL,
   .__counter = INT_MAX,
   .__from_name = (char *) "INTERNAL",
-  .__to_name = (char *) "ANSI_X3.4-1968//TRANSLIT",
-  .__fct = __gconv_transform_internal_ascii,
+  .__to_name = (char *) "POSIX",
+  .__fct = __gconv_transform_internal_posix,
   .__btowc_fct = NULL,
   .__init_fct = NULL,
   .__end_fct = NULL,
@@ -67,7 +67,9 @@ static const struct __gconv_step to_mb =
 };
 
 
-/* For the default locale we only have to handle ANSI_X3.4-1968.  */
+/* The default/"POSIX"/"C" locale is an 8-bit-clean mapping
+   with ANSI_X3.4-1968 in the first 128 characters;
+   we lift the remaining bytes by <UDC00>.  */
 const struct gconv_fcts __wcsmbs_gconv_fcts_c =
 {
   .towc = (struct __gconv_step *) &to_wc,
-- 
2.30.2

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2023-05-29 13:54 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-08-30 18:19 [PATCH] POSIX locale covers every byte [BZ# 29511] наб
2022-09-06 14:06 ` [PATCH v2] " наб
2022-09-06 14:19 ` [PATCH] " Florian Weimer
2022-09-06 18:06   ` наб
2022-09-06 18:10     ` [PATCH v3 1/2] iconvdata/tst-table-charmap.sh: remove handling of old, borrowed format наб
2022-09-14  2:39       ` [PATCH v4 " наб
2022-09-21 14:01         ` [PATCH v5 " наб
2022-11-02 17:17           ` [PATCH v6 " наб
2022-11-09 12:49             ` Florian Weimer
2022-11-02 17:17           ` [PATCH v6 2/2] POSIX locale covers every byte [BZ# 29511] наб
2022-11-09 14:20             ` Florian Weimer
2022-11-09 16:14               ` [PATCH v7] " наб
2022-11-10  9:52                 ` Florian Weimer
2023-01-09 15:17                   ` [PATCH v8] " наб
2023-02-07 14:16                     ` [PATCH v9] " наб
2023-02-13 14:52                       ` Florian Weimer
2023-04-26 18:54                         ` наб
2023-04-26 21:27                           ` Florian Weimer
2023-04-27  0:17                             ` [PATCH v10] " наб
2023-04-28 15:43                               ` [PATCH v11] " наб
2023-05-07 22:53                                 ` [PATCH v12] " наб
2023-05-29 13:54                                   ` [PATCH v13] " наб
2022-11-10  8:10               ` [PATCH v6 2/2] " Florian Weimer
2022-11-28 16:24                 ` наб
2022-12-02 17:36                   ` Florian Weimer
2022-12-02 18:42                     ` наб
2022-09-21 14:01         ` [PATCH v5 " наб
2022-09-14  2:39       ` [PATCH v4 " наб
2022-09-06 18:11     ` [PATCH v3 " наб

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).