public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: Max Gautier <mg@max.gautier.name>
To: libc-alpha@sourceware.org
Cc: Max Gautier <mg@max.gautier.name>
Subject: [PATCH 3/5] Transform UTF-7 to MODIFIED-UTF-7
Date: Thu, 20 Aug 2020 01:07:00 +0200	[thread overview]
Message-ID: <20200819230702.229822-4-mg@max.gautier.name> (raw)
In-Reply-To: <20200819230702.229822-1-mg@max.gautier.name>

* Shift character is '&' instead of '+'
* No "optional direct characters" set
* Modified base64 character set (',' replaces '/')
* Use direct comparisons instead of lookup tables and bitwise operations
---
Regarding the fourth item, if there are reasons to prefer the bitwise
approach, please let me know.
 iconvdata/modified-utf-7.c | 97 ++++++++++++--------------------------
 1 file changed, 31 insertions(+), 66 deletions(-)

diff --git a/iconvdata/modified-utf-7.c b/iconvdata/modified-utf-7.c
index fc6a8dfcfd..e6eb784891 100644
--- a/iconvdata/modified-utf-7.c
+++ b/iconvdata/modified-utf-7.c
@@ -1,4 +1,4 @@
-/* Conversion module for UTF-7.
+/* Conversion module for Modified UTF-7.
    Copyright (C) 2000-2020 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,12 +16,12 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-/* UTF-7 is a legacy encoding used for transmitting Unicode within the
-   ASCII character set, used primarily by mail agents.  New programs
-   are encouraged to use UTF-8 instead.
+/* Modified UTF-7 is a legacy encoding used for transmitting Unicode within the
+   ASCII character set, used primarily by IMAP servers and clients.
+   New programs are encouraged to use UTF-8 instead.
 
-   UTF-7 is specified in RFC 2152 (and old RFC 1641, RFC 1642).  The
-   original Base64 encoding is defined in RFC 2045.  */
+   Modified UTF-7 is specified in RFC 3501 as part of the IMAPv4 specification.
+   The original Base64 encoding is defined in RFC 2045.  */
 
 #include <dlfcn.h>
 #include <gconv.h>
@@ -29,64 +29,29 @@
 #include <stdlib.h>
 
 
-/* Define this to 1 if you want the so-called "optional direct" characters
-      ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
-   to be encoded. Define to 0 if you want them to be passed straight
-   through, like the so-called "direct" characters.
-   We set this to 1 because it's safer.
- */
-#define UTF7_ENCODE_OPTIONAL_CHARS 1
-
-
 /* The set of "direct characters":
    A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
+   ! " # $ % + * ; < = > @ [ ] ^ _ ` { | }
 */
 
-static const unsigned char direct_tab[128 / 8] =
-  {
-    0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
-    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
-  };
-
 static int
 isdirect (uint32_t ch)
 {
-  return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1));
-}
-
-
-/* The set of "direct and optional direct characters":
-   A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
-   ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
-*/
-
-static const unsigned char xdirect_tab[128 / 8] =
-  {
-    0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff,
-    0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f
-  };
-
-static int
-isxdirect (uint32_t ch)
-{
-  return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1));
+  return ((ch == '\n' || ch == '\t' || ch == '\r')
+		  || (ch >= 0x20 && ch <= 0x7e && ch != '&'));
 }
 
-
-/* The set of "extended base64 characters":
-   A-Z a-z 0-9 + / -
+/* The set of "modified base64 characters":
+   A-Z a-z 0-9 + , -
 */
 
-static const unsigned char xbase64_tab[128 / 8] =
-  {
-    0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03,
-    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
-  };
-
 static int
-isxbase64 (uint32_t ch)
+ismbase64 (uint32_t ch)
 {
-  return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1));
+  return ((ch >= 'a' && ch <= 'z')
+			  || (ch >= 'A' && ch <= 'Z')
+			  || (ch >= '0' && ch <= '9')
+			  || (ch == '+' || ch == ','));
 }
 
 
@@ -103,18 +68,18 @@ base64 (unsigned int i)
   else if (i == 62)
     return '+';
   else if (i == 63)
-    return '/';
+    return ',';
   else
     abort ();
 }
 
 
 /* Definitions used in the body of the `gconv' function.  */
-#define CHARSET_NAME		"UTF-7//"
+#define CHARSET_NAME		"MODIFIED-UTF-7//"
 #define DEFINE_INIT		1
 #define DEFINE_FINI		1
-#define FROM_LOOP		from_utf7_loop
-#define TO_LOOP			to_utf7_loop
+#define FROM_LOOP		from_m_utf7_loop
+#define TO_LOOP			to_m_utf7_loop
 #define MIN_NEEDED_FROM		1
 #define MAX_NEEDED_FROM		6
 #define MIN_NEEDED_TO		4
@@ -161,13 +126,13 @@ base64 (unsigned int i)
     if ((statep->__count >> 3) == 0)					      \
       {									      \
 	/* base64 encoding inactive.  */				      \
-	if (isxdirect (ch))						      \
+	if (isdirect (ch))						      \
 	  {								      \
 	    inptr++;							      \
 	    put32 (outptr, ch);						      \
 	    outptr += 4;						      \
 	  }								      \
-	else if (__glibc_likely (ch == '+'))				      \
+	else if (__glibc_likely (ch == '&'))				      \
 	  {								      \
 	    if (__glibc_unlikely (inptr + 2 > inend))			      \
 	      {								      \
@@ -209,7 +174,7 @@ base64 (unsigned int i)
 	  i = ch - '0' + 52;						      \
 	else if (ch == '+')						      \
 	  i = 62;							      \
-	else if (ch == '/')						      \
+	else if (ch == ',')						      \
 	  i = 63;							      \
 	else								      \
 	  {								      \
@@ -323,7 +288,7 @@ base64 (unsigned int i)
     if ((statep->__count & 0x18) == 0)					      \
       {									      \
 	/* base64 encoding inactive */					      \
-	if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch))      \
+	if (isdirect (ch))      \
 	  {								      \
 	    *outptr++ = (unsigned char) ch;				      \
 	  }								      \
@@ -331,7 +296,7 @@ base64 (unsigned int i)
 	  {								      \
 	    size_t count;						      \
 									      \
-	    if (ch == '+')						      \
+	    if (ch == '&')						      \
 	      count = 2;						      \
 	    else if (ch < 0x10000)					      \
 	      count = 3;						      \
@@ -346,8 +311,8 @@ base64 (unsigned int i)
 		break;							      \
 	      }								      \
 									      \
-	    *outptr++ = '+';						      \
-	    if (ch == '+')						      \
+	    *outptr++ = '&';						      \
+	    if (ch == '&')						      \
 	      *outptr++ = '-';						      \
 	    else if (ch < 0x10000)					      \
 	      {								      \
@@ -375,12 +340,12 @@ base64 (unsigned int i)
     else								      \
       {									      \
 	/* base64 encoding active */					      \
-	if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch))      \
+	if (isdirect (ch))      \
 	  {								      \
 	    /* deactivate base64 encoding */				      \
 	    size_t count;						      \
 									      \
-	    count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1;  \
+	    count = ((statep->__count & 0x18) >= 0x10) + ismbase64 (ch) + 1;  \
 	    if (__glibc_unlikely (outptr + count > outend))		      \
 	      {								      \
 		result = __GCONV_FULL_OUTPUT;				      \
@@ -389,7 +354,7 @@ base64 (unsigned int i)
 									      \
 	    if ((statep->__count & 0x18) >= 0x10)			      \
 	      *outptr++ = base64 ((statep->__count >> 3) & ~3);		      \
-	    if (isxbase64 (ch))						      \
+	    if (ismbase64 (ch))						      \
 	      *outptr++ = '-';						      \
 	    *outptr++ = (unsigned char) ch;				      \
 	    statep->__count = 0;					      \
@@ -499,7 +464,7 @@ base64 (unsigned int i)
     memset (data->__statep, '\0', sizeof (mbstate_t));			      \
   else									      \
     {									      \
-      /* The "to UTF-7" direction.  Flush the remaining bits and terminate    \
+      /* The "to M-UTF-7" direction.  Flush the remaining bits and terminate    \
 	 with a '-' byte.  This will guarantee correct decoding if more	      \
 	 UTF-7 encoded text is added afterwards.  */			      \
       int state = data->__statep->__count;				      \
-- 
2.28.0


  parent reply	other threads:[~2020-08-19 23:05 UTC|newest]

Thread overview: 60+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-08-19 23:06 [PATCH 0/5] iconv: module for MODIFIED-UTF-7 Max Gautier
2020-08-19 23:06 ` [PATCH 1/5] Copy utf-7 module to modified-utf-7 Max Gautier
2020-08-19 23:06 ` [PATCH 2/5] Update gconv-modules file Max Gautier
2020-08-19 23:07 ` Max Gautier [this message]
2020-08-19 23:07 ` [PATCH 4/5] Make terminating base64 sequences mandatory Max Gautier
2020-08-19 23:07 ` [PATCH 5/5] Add test case for MODIFIED-UTF-7 Max Gautier
2020-08-20  7:18   ` Andreas Schwab
2020-08-20 15:40     ` [PATCH v2 " Max Gautier
2020-08-20  8:03 ` [PATCH 0/5] iconv: module " Florian Weimer
2020-08-20 15:19   ` Max Gautier
2020-08-20 15:58     ` Florian Weimer
2020-09-02 15:24   ` Max Gautier
2020-09-02 20:01     ` Adhemerval Zanella
2020-09-03  9:47       ` Max Gautier
2020-09-03 10:56         ` Andreas Schwab
2021-01-25  9:02   ` [PATCH v3 0/5] iconv: module for IMAP-UTF-7 Max Gautier
2021-01-25  9:02     ` [PATCH v3 1/5] Copy utf-7 module to modified-utf-7 Max Gautier
2021-01-25  9:31       ` Andreas Schwab
2021-01-25 13:51         ` Max Gautier
2021-02-07  9:42           ` Florian Weimer
2021-02-07 12:29             ` Max Gautier
2021-02-07 12:34               ` Florian Weimer
2021-12-09  9:31             ` [PATCH v4 0/4] iconv: Add support for UTF-7-IMAP Max Gautier
2021-12-09  9:31               ` [PATCH v4 1/4] iconv: Always encode "optional direct" UTF-7 characters Max Gautier
2022-03-07 12:10                 ` Adhemerval Zanella
2021-12-09  9:31               ` [PATCH v4 2/4] iconv: Better mapping to RFC for UTF-7 Max Gautier
2022-03-07 12:14                 ` Adhemerval Zanella
2022-03-20 16:41                 ` [PATCH v5 " Max Gautier
2022-03-21 11:53                   ` Adhemerval Zanella
2022-03-21 11:59                     ` Adhemerval Zanella
2022-03-21 12:06                       ` Adhemerval Zanella
2022-03-21 14:07                       ` Max Gautier
2021-12-09  9:31               ` [PATCH v4 3/4] iconv: make utf-7.c able to use variants Max Gautier
2022-03-07 12:34                 ` Adhemerval Zanella
2022-03-12 11:07                   ` Max Gautier
2022-03-14 12:17                     ` Adhemerval Zanella
2022-03-20 16:42                 ` [PATCH v5 " Max Gautier
2022-03-21 12:24                   ` Adhemerval Zanella
2021-12-09  9:31               ` [PATCH v4 4/4] iconv: Add UTF-7-IMAP variant in utf-7.c Max Gautier
2022-03-07 12:46                 ` Adhemerval Zanella
2022-03-20 16:43                 ` [PATCH v5 " Max Gautier
2022-03-21 12:24                   ` Adhemerval Zanella
2021-12-17 13:15               ` [PATCH v4 0/4] iconv: Add support for UTF-7-IMAP Max Gautier
2022-01-24 14:19                 ` Adhemerval Zanella
2022-02-10 13:16                   ` Max Gautier
2022-02-10 13:17                     ` Adhemerval Zanella
2022-03-04  8:53                       ` Max Gautier
2022-01-17 14:07               ` Max Gautier
2022-01-24  9:17               ` Max Gautier
2021-01-25  9:02     ` [PATCH v3 2/5] Update gconv-modules file Max Gautier
2021-02-07  9:49       ` Florian Weimer
2021-01-25  9:02     ` [PATCH v3 3/5] Transform UTF-7 to IMAP-UTF-7 Max Gautier
2021-01-25  9:02     ` [PATCH v3 4/5] Make terminating base64 sequences mandatory Max Gautier
2021-02-07  9:45       ` Florian Weimer
2021-01-25  9:02     ` [PATCH v3 5/5] Add test case for IMAP-UTF-7 Max Gautier
2021-02-07  9:49       ` Florian Weimer
2021-03-16 14:39     ` [PATCH v3 5/5][pw utf test] " Siddhesh Poyarekar
2022-03-21 12:28     ` [PATCH v3 0/5] iconv: module " Adhemerval Zanella
2022-03-21 14:09       ` Max Gautier
2021-01-12  9:12 ` [PATCH 0/5] iconv: module for MODIFIED-UTF-7 Florian Weimer

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200819230702.229822-4-mg@max.gautier.name \
    --to=mg@max.gautier.name \
    --cc=libc-alpha@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).