From: Max Gautier <mg@max.gautier.name>
To: libc-alpha@sourceware.org
Cc: Max Gautier <mg@max.gautier.name>
Subject: [PATCH 3/5] Transform UTF-7 to MODIFIED-UTF-7
Date: Thu, 20 Aug 2020 01:07:00 +0200 [thread overview]
Message-ID: <20200819230702.229822-4-mg@max.gautier.name> (raw)
In-Reply-To: <20200819230702.229822-1-mg@max.gautier.name>
* shift character is '&' instead of '+'
* No "optional direct characters" set
* modified base64 character set
* use direct comparisons instead of lookup tables and bitwise operations
---
Regarding the fourth item, if there are reasons to keep the bitwise
approach, please let me know.
iconvdata/modified-utf-7.c | 97 ++++++++++++--------------------------
1 file changed, 31 insertions(+), 66 deletions(-)
diff --git a/iconvdata/modified-utf-7.c b/iconvdata/modified-utf-7.c
index fc6a8dfcfd..e6eb784891 100644
--- a/iconvdata/modified-utf-7.c
+++ b/iconvdata/modified-utf-7.c
@@ -1,4 +1,4 @@
-/* Conversion module for UTF-7.
+/* Conversion module for Modified UTF-7.
Copyright (C) 2000-2020 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,12 +16,12 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-/* UTF-7 is a legacy encoding used for transmitting Unicode within the
- ASCII character set, used primarily by mail agents. New programs
- are encouraged to use UTF-8 instead.
+/* Modified UTF-7 is a legacy encoding used for transmitting Unicode within the
+ ASCII character set, used primarily by IMAP servers and clients.
+ New programs are encouraged to use UTF-8 instead.
- UTF-7 is specified in RFC 2152 (and old RFC 1641, RFC 1642). The
- original Base64 encoding is defined in RFC 2045. */
+ Modified UTF-7 is specified in RFC 3501 as part of the IMAPv4 specification.
+ The original Base64 encoding is defined in RFC 2045. */
#include <dlfcn.h>
#include <gconv.h>
@@ -29,64 +29,29 @@
#include <stdlib.h>
-/* Define this to 1 if you want the so-called "optional direct" characters
- ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
- to be encoded. Define to 0 if you want them to be passed straight
- through, like the so-called "direct" characters.
- We set this to 1 because it's safer.
- */
-#define UTF7_ENCODE_OPTIONAL_CHARS 1
-
-
/* The set of "direct characters":
A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
+ ! " # $ % + * ; < = > @ [ ] ^ _ ` { | }
*/
-static const unsigned char direct_tab[128 / 8] =
- {
- 0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
- 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
- };
-
static int
isdirect (uint32_t ch)
{
- return (ch < 128 && ((direct_tab[ch >> 3] >> (ch & 7)) & 1));
-}
-
-
-/* The set of "direct and optional direct characters":
- A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
- ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
-*/
-
-static const unsigned char xdirect_tab[128 / 8] =
- {
- 0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff,
- 0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f
- };
-
-static int
-isxdirect (uint32_t ch)
-{
- return (ch < 128 && ((xdirect_tab[ch >> 3] >> (ch & 7)) & 1));
+ return ((ch == '\n' || ch == '\t' || ch == '\r')
+ || (ch >= 0x20 && ch <= 0x7e && ch != '&'));
}
-
-/* The set of "extended base64 characters":
- A-Z a-z 0-9 + / -
+/* The set of "modified base64 characters":
+ A-Z a-z 0-9 + , -
*/
-static const unsigned char xbase64_tab[128 / 8] =
- {
- 0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03,
- 0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
- };
-
static int
-isxbase64 (uint32_t ch)
+ismbase64 (uint32_t ch)
{
- return (ch < 128 && ((xbase64_tab[ch >> 3] >> (ch & 7)) & 1));
+ return ((ch >= 'a' && ch <= 'z')
+ || (ch >= 'A' && ch <= 'Z')
+ || (ch >= '0' && ch <= '9')
+ || (ch == '+' || ch == ','));
}
@@ -103,18 +68,18 @@ base64 (unsigned int i)
else if (i == 62)
return '+';
else if (i == 63)
- return '/';
+ return ',';
else
abort ();
}
/* Definitions used in the body of the `gconv' function. */
-#define CHARSET_NAME "UTF-7//"
+#define CHARSET_NAME "MODIFIED-UTF-7//"
#define DEFINE_INIT 1
#define DEFINE_FINI 1
-#define FROM_LOOP from_utf7_loop
-#define TO_LOOP to_utf7_loop
+#define FROM_LOOP from_m_utf7_loop
+#define TO_LOOP to_m_utf7_loop
#define MIN_NEEDED_FROM 1
#define MAX_NEEDED_FROM 6
#define MIN_NEEDED_TO 4
@@ -161,13 +126,13 @@ base64 (unsigned int i)
if ((statep->__count >> 3) == 0) \
{ \
/* base64 encoding inactive. */ \
- if (isxdirect (ch)) \
+ if (isdirect (ch)) \
{ \
inptr++; \
put32 (outptr, ch); \
outptr += 4; \
} \
- else if (__glibc_likely (ch == '+')) \
+ else if (__glibc_likely (ch == '&')) \
{ \
if (__glibc_unlikely (inptr + 2 > inend)) \
{ \
@@ -209,7 +174,7 @@ base64 (unsigned int i)
i = ch - '0' + 52; \
else if (ch == '+') \
i = 62; \
- else if (ch == '/') \
+ else if (ch == ',') \
i = 63; \
else \
{ \
@@ -323,7 +288,7 @@ base64 (unsigned int i)
if ((statep->__count & 0x18) == 0) \
{ \
/* base64 encoding inactive */ \
- if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch)) \
+ if (isdirect (ch)) \
{ \
*outptr++ = (unsigned char) ch; \
} \
@@ -331,7 +296,7 @@ base64 (unsigned int i)
{ \
size_t count; \
\
- if (ch == '+') \
+ if (ch == '&') \
count = 2; \
else if (ch < 0x10000) \
count = 3; \
@@ -346,8 +311,8 @@ base64 (unsigned int i)
break; \
} \
\
- *outptr++ = '+'; \
- if (ch == '+') \
+ *outptr++ = '&'; \
+ if (ch == '&') \
*outptr++ = '-'; \
else if (ch < 0x10000) \
{ \
@@ -375,12 +340,12 @@ base64 (unsigned int i)
else \
{ \
/* base64 encoding active */ \
- if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch)) \
+ if (isdirect (ch)) \
{ \
/* deactivate base64 encoding */ \
size_t count; \
\
- count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1; \
+ count = ((statep->__count & 0x18) >= 0x10) + ismbase64 (ch) + 1; \
if (__glibc_unlikely (outptr + count > outend)) \
{ \
result = __GCONV_FULL_OUTPUT; \
@@ -389,7 +354,7 @@ base64 (unsigned int i)
\
if ((statep->__count & 0x18) >= 0x10) \
*outptr++ = base64 ((statep->__count >> 3) & ~3); \
- if (isxbase64 (ch)) \
+ if (ismbase64 (ch)) \
*outptr++ = '-'; \
*outptr++ = (unsigned char) ch; \
statep->__count = 0; \
@@ -499,7 +464,7 @@ base64 (unsigned int i)
memset (data->__statep, '\0', sizeof (mbstate_t)); \
else \
{ \
- /* The "to UTF-7" direction. Flush the remaining bits and terminate \
+ /* The "to M-UTF-7" direction. Flush the remaining bits and terminate \
with a '-' byte. This will guarantee correct decoding if more \
UTF-7 encoded text is added afterwards. */ \
int state = data->__statep->__count; \
--
2.28.0
next prev parent reply other threads:[~2020-08-19 23:05 UTC|newest]
Thread overview: 60+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-08-19 23:06 [PATCH 0/5] iconv: module for MODIFIED-UTF-7 Max Gautier
2020-08-19 23:06 ` [PATCH 1/5] Copy utf-7 module to modified-utf-7 Max Gautier
2020-08-19 23:06 ` [PATCH 2/5] Update gconv-modules file Max Gautier
2020-08-19 23:07 ` Max Gautier [this message]
2020-08-19 23:07 ` [PATCH 4/5] Make terminating base64 sequences mandatory Max Gautier
2020-08-19 23:07 ` [PATCH 5/5] Add test case for MODIFIED-UTF-7 Max Gautier
2020-08-20 7:18 ` Andreas Schwab
2020-08-20 15:40 ` [PATCH v2 " Max Gautier
2020-08-20 8:03 ` [PATCH 0/5] iconv: module " Florian Weimer
2020-08-20 15:19 ` Max Gautier
2020-08-20 15:58 ` Florian Weimer
2020-09-02 15:24 ` Max Gautier
2020-09-02 20:01 ` Adhemerval Zanella
2020-09-03 9:47 ` Max Gautier
2020-09-03 10:56 ` Andreas Schwab
2021-01-25 9:02 ` [PATCH v3 0/5] iconv: module for IMAP-UTF-7 Max Gautier
2021-01-25 9:02 ` [PATCH v3 1/5] Copy utf-7 module to modified-utf-7 Max Gautier
2021-01-25 9:31 ` Andreas Schwab
2021-01-25 13:51 ` Max Gautier
2021-02-07 9:42 ` Florian Weimer
2021-02-07 12:29 ` Max Gautier
2021-02-07 12:34 ` Florian Weimer
2021-12-09 9:31 ` [PATCH v4 0/4] iconv: Add support for UTF-7-IMAP Max Gautier
2021-12-09 9:31 ` [PATCH v4 1/4] iconv: Always encode "optional direct" UTF-7 characters Max Gautier
2022-03-07 12:10 ` Adhemerval Zanella
2021-12-09 9:31 ` [PATCH v4 2/4] iconv: Better mapping to RFC for UTF-7 Max Gautier
2022-03-07 12:14 ` Adhemerval Zanella
2022-03-20 16:41 ` [PATCH v5 " Max Gautier
2022-03-21 11:53 ` Adhemerval Zanella
2022-03-21 11:59 ` Adhemerval Zanella
2022-03-21 12:06 ` Adhemerval Zanella
2022-03-21 14:07 ` Max Gautier
2021-12-09 9:31 ` [PATCH v4 3/4] iconv: make utf-7.c able to use variants Max Gautier
2022-03-07 12:34 ` Adhemerval Zanella
2022-03-12 11:07 ` Max Gautier
2022-03-14 12:17 ` Adhemerval Zanella
2022-03-20 16:42 ` [PATCH v5 " Max Gautier
2022-03-21 12:24 ` Adhemerval Zanella
2021-12-09 9:31 ` [PATCH v4 4/4] iconv: Add UTF-7-IMAP variant in utf-7.c Max Gautier
2022-03-07 12:46 ` Adhemerval Zanella
2022-03-20 16:43 ` [PATCH v5 " Max Gautier
2022-03-21 12:24 ` Adhemerval Zanella
2021-12-17 13:15 ` [PATCH v4 0/4] iconv: Add support for UTF-7-IMAP Max Gautier
2022-01-24 14:19 ` Adhemerval Zanella
2022-02-10 13:16 ` Max Gautier
2022-02-10 13:17 ` Adhemerval Zanella
2022-03-04 8:53 ` Max Gautier
2022-01-17 14:07 ` Max Gautier
2022-01-24 9:17 ` Max Gautier
2021-01-25 9:02 ` [PATCH v3 2/5] Update gconv-modules file Max Gautier
2021-02-07 9:49 ` Florian Weimer
2021-01-25 9:02 ` [PATCH v3 3/5] Transform UTF-7 to IMAP-UTF-7 Max Gautier
2021-01-25 9:02 ` [PATCH v3 4/5] Make terminating base64 sequences mandatory Max Gautier
2021-02-07 9:45 ` Florian Weimer
2021-01-25 9:02 ` [PATCH v3 5/5] Add test case for IMAP-UTF-7 Max Gautier
2021-02-07 9:49 ` Florian Weimer
2021-03-16 14:39 ` [PATCH v3 5/5][pw utf test] " Siddhesh Poyarekar
2022-03-21 12:28 ` [PATCH v3 0/5] iconv: module " Adhemerval Zanella
2022-03-21 14:09 ` Max Gautier
2021-01-12 9:12 ` [PATCH 0/5] iconv: module for MODIFIED-UTF-7 Florian Weimer
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200819230702.229822-4-mg@max.gautier.name \
--to=mg@max.gautier.name \
--cc=libc-alpha@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).