From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 49451 invoked by alias); 26 Jul 2018 14:51:02 -0000 Mailing-List: contact libc-alpha-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: libc-alpha-owner@sourceware.org Received: (qmail 49439 invoked by uid 89); 26 Jul 2018 14:51:01 -0000 Authentication-Results: sourceware.org; auth=none X-Spam-SWARE-Status: No, score=-2.3 required=5.0 tests=AWL,BAYES_00,SPF_HELO_PASS autolearn=ham version=3.3.2 spammy=lz, LZ, l9, L9 X-HELO: mx1.redhat.com Subject: Re: [PATCHv4a] Expected behaviour for a-z, A-Z, and 0-9 (Bug 23393). To: libc-alpha@sourceware.org References: <9d6f47ec-f9eb-ead0-889c-3b9aae66551c@redhat.com> <5bcef059-b928-d2e9-82dd-2ae68be96020@redhat.com> <541d18da-6318-382e-d5cd-6c69a5db1a07@redhat.com> <8359bdf2-457e-e2f1-ac90-e4b27b2e0495@redhat.com> <94e7e1b2-6bfe-617f-2060-160631f82f80@redhat.com> From: Florian Weimer Message-ID: <7630a77d-ae62-d500-aa36-dc5e54ff38b5@redhat.com> Date: Thu, 26 Jul 2018 14:51:00 -0000 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Thunderbird/52.8.0 MIME-Version: 1.0 In-Reply-To: Content-Type: multipart/mixed; boundary="------------DF22FF72530EDAE564BFED98" X-SW-Source: 2018-07/txt/msg00900.txt.bz2 This is a multi-part message in MIME format. --------------DF22FF72530EDAE564BFED98 Content-Type: text/plain; charset=utf-8; format=flowed Content-Transfer-Encoding: 7bit Content-length: 3967 On 07/26/2018 04:34 AM, Carlos O'Donell wrote: > On 07/25/2018 04:57 PM, Carlos O'Donell wrote: >> v4 >> - Fixed ar_SA, km_KH, lo_LA, or_IN, sl_SI, th_TH. >> - Added range checking for a-z, A-Z for all supported UTF-8 locales. >> >> All of my testers are clean. > > Attaching v4 on top of the current master. > > This fixes all the locales. I wrote another enumeration tester, this time covering all locales. 
It found these issues: az_AZ: U+000069 fails to match /[a-z]/ az_AZ: U+000049 fails to match /[A-Z]/ az_AZ.utf8: U+000069 fails to match /[a-z]/ az_AZ.utf8: U+000049 fails to match /[A-Z]/ crh_UA: U+000069 fails to match /[a-z]/ crh_UA: U+000049 fails to match /[A-Z]/ crh_UA.utf8: U+000069 fails to match /[a-z]/ crh_UA.utf8: U+000049 fails to match /[A-Z]/ ku_TR: U+000069 fails to match /[a-z]/ ku_TR: U+000049 fails to match /[A-Z]/ ku_TR.iso88599: U+000069 fails to match /[a-z]/ ku_TR.iso88599: U+000049 fails to match /[A-Z]/ ku_TR.utf8: U+000069 fails to match /[a-z]/ ku_TR.utf8: U+000049 fails to match /[A-Z]/ lv_LV: U+000079 fails to match /[a-z]/ lv_LV: U+000059 fails to match /[A-Z]/ lv_LV.iso885913: U+000079 fails to match /[a-z]/ lv_LV.iso885913: U+000059 fails to match /[A-Z]/ lv_LV.utf8: U+000079 fails to match /[a-z]/ lv_LV.utf8: U+000059 fails to match /[A-Z]/ shs_CA: U+0000E6 matches /[a-z]/ unexpectedly shs_CA: U+0000C6 matches /[A-Z]/ unexpectedly shs_CA.utf8: U+0000E6 matches /[a-z]/ unexpectedly shs_CA.utf8: U+0000C6 matches /[A-Z]/ unexpectedly slovene: U+00006A fails to match /[a-z]/ slovene: U+00006B fails to match /[a-z]/ slovene: U+00006C fails to match /[a-z]/ slovene: U+00006D fails to match /[a-z]/ slovene: U+00006E fails to match /[a-z]/ slovene: U+00006F fails to match /[a-z]/ slovenian: U+00006A fails to match /[a-z]/ slovenian: U+00006B fails to match /[a-z]/ slovenian: U+00006C fails to match /[a-z]/ slovenian: U+00006D fails to match /[a-z]/ slovenian: U+00006E fails to match /[a-z]/ slovenian: U+00006F fails to match /[a-z]/ sl_SI: U+00006A fails to match /[a-z]/ sl_SI: U+00006B fails to match /[a-z]/ sl_SI: U+00006C fails to match /[a-z]/ sl_SI: U+00006D fails to match /[a-z]/ sl_SI: U+00006E fails to match /[a-z]/ sl_SI: U+00006F fails to match /[a-z]/ sl_SI.iso88592: U+00006A fails to match /[a-z]/ sl_SI.iso88592: U+00006B fails to match /[a-z]/ sl_SI.iso88592: U+00006C fails to match /[a-z]/ sl_SI.iso88592: U+00006D fails to 
match /[a-z]/ sl_SI.iso88592: U+00006E fails to match /[a-z]/ sl_SI.iso88592: U+00006F fails to match /[a-z]/ sl_SI.utf8: U+00006A fails to match /[a-z]/ sl_SI.utf8: U+00006B fails to match /[a-z]/ sl_SI.utf8: U+00006C fails to match /[a-z]/ sl_SI.utf8: U+00006D fails to match /[a-z]/ sl_SI.utf8: U+00006E fails to match /[a-z]/ sl_SI.utf8: U+00006F fails to match /[a-z]/ sv_FI: U+000077 fails to match /[a-z]/ sv_FI: U+000057 fails to match /[A-Z]/ sv_FI@euro: U+000077 fails to match /[a-z]/ sv_FI@euro: U+000057 fails to match /[A-Z]/ sv_FI.iso88591: U+000077 fails to match /[a-z]/ sv_FI.iso88591: U+000057 fails to match /[A-Z]/ sv_FI.iso885915@euro: U+000077 fails to match /[a-z]/ sv_FI.iso885915@euro: U+000057 fails to match /[A-Z]/ sv_FI.utf8: U+000077 fails to match /[a-z]/ sv_FI.utf8: U+000057 fails to match /[A-Z]/ sv_SE: U+000077 fails to match /[a-z]/ sv_SE: U+000057 fails to match /[A-Z]/ sv_SE.iso88591: U+000077 fails to match /[a-z]/ sv_SE.iso88591: U+000057 fails to match /[A-Z]/ sv_SE.utf8: U+000077 fails to match /[a-z]/ sv_SE.utf8: U+000057 fails to match /[A-Z]/ swedish: U+000077 fails to match /[a-z]/ swedish: U+000057 fails to match /[A-Z]/ tt_RU: U+000069 fails to match /[a-z]/ tt_RU: U+000049 fails to match /[A-Z]/ tt_RU@iqtelif: U+000069 fails to match /[a-z]/ tt_RU@iqtelif: U+000049 fails to match /[A-Z]/ tt_RU.utf8: U+000069 fails to match /[a-z]/ tt_RU.utf8: U+000049 fails to match /[A-Z]/ tt_RU.utf8@iqtelif: U+000069 fails to match /[a-z]/ tt_RU.utf8@iqtelif: U+000049 fails to match /[A-Z]/ Thanks, Florian --------------DF22FF72530EDAE564BFED98 Content-Type: text/x-c++src; name="rational-ranges-1.cc" Content-Transfer-Encoding: 7bit Content-Disposition: attachment; filename="rational-ranges-1.cc" Content-length: 2944
// Enumeration tester: for every locale reported by "locale -a", checks that
// the POSIX regular expressions [0-9], [a-z] and [A-Z] match exactly the
// characters in the corresponding code point range, and prints one line per
// character that deviates.
//
// NOTE(review): the header names, template arguments and "&reg" tokens were
// stripped from the archived copy of this attachment.  They are reconstructed
// here from the identifiers the program uses -- confirm against the original
// rational-ranges-1.cc.
#include <algorithm>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <locale.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <vector>
#include <wchar.h>

// Runs "locale -a" and returns the whitespace-separated names it prints,
// sorted.  Terminates the process via err/errx if the command cannot be
// started, its output cannot be read completely, or it reports failure.
static std::vector<std::string>
get_locales()
{
  FILE *fp = popen("locale -a", "r");
  if (fp == nullptr)
    err(1, "running locale -a");

  std::vector<std::string> result;
  while (true)
    {
      char *elem{};
      // "%ms" makes fscanf allocate the buffer for each name; we own it.
      int ret = fscanf(fp, "%ms", &elem);
      if (ret != 1)
        break;
      if (elem == nullptr)
        errx(1, "invalid fscanf result");
      result.emplace_back(elem);
      free(elem);
    }
  // Stopping for any reason other than a clean end of input is fatal.  This
  // also covers fscanf failing without setting the stream's EOF or error
  // indicator, which would otherwise make a feof-driven loop spin forever.
  if (ferror(fp) || !feof(fp))
    err(1, "fscanf failed");

  int ret = pclose(fp);
  // errno is only meaningful when pclose itself failed (-1); a non-zero wait
  // status from the child must not be decorated with a stale strerror text.
  if (ret == -1)
    err(1, "locale -a failed with status %d", ret);
  if (ret != 0)
    errx(1, "locale -a failed with status %d", ret);

  std::sort(result.begin(), result.end());
  return result;
}

// Compiles PATTERN as an extended regular expression (under whatever locale
// is current when this function is called) and runs it against every wide
// character from 1 to 0x10FFFF that wcrtomb can convert.  A line is printed
// on stdout, prefixed with LOCALE, for each character where the regex result
// disagrees with membership in the inclusive range [range.first, range.second].
// Unexpected library failures terminate the process via err/errx.
static void
test_regexp_range(const char *locale, const char *pattern,
                  std::pair<wchar_t, wchar_t> range)
{
  regex_t reg;
  {
    int ret = regcomp(&reg, pattern, REG_EXTENDED | REG_NOSUB);
    if (ret != 0)
      errx(1, "Cannot compile regular expression /%s/: %d", pattern, ret);
  }

  const wchar_t maximum_character = 0x10FFFF;
  const unsigned maximum_length = 5; /* With NUL.  */
  for (wchar_t ch = 1; ch <= maximum_character; ++ch)
    {
      char uch[MB_LEN_MAX];
      mbstate_t ps{};
      {
        size_t ret = wcrtomb(uch, ch, &ps);
        if (ret == static_cast<size_t>(-1))
          {
            // Characters the current locale cannot encode are skipped.
            if (errno == EILSEQ)
              continue;
            err(1, "wcrtomb(0x%x)", static_cast<unsigned>(ch));
          }
        else if (ret == 0)
          continue; // Some anomaly.
        if (ret >= maximum_length)
          errx(1, "multi-byte length %zu at 0x%x exceeds %u",
               ret, static_cast<unsigned>(ch), maximum_length);
        // wcrtomb does not NUL-terminate; regexec needs a C string.
        uch[ret] = '\0';
      }

      int ret = regexec(&reg, uch, 0, nullptr, 0);
      if (ret != 0 && ret != REG_NOMATCH)
        errx(1, "regexec of /%s/ failed with code %d", pattern, ret);

      bool regex_matches = ret == 0;
      bool range_matches = range.first <= ch && ch <= range.second;
      if (regex_matches != range_matches)
        {
          if (regex_matches)
            printf("%s: U+%06X matches /%s/ unexpectedly\n",
                   locale, static_cast<unsigned>(ch), pattern);
          else
            printf("%s: U+%06X fails to match /%s/\n",
                   locale, static_cast<unsigned>(ch), pattern);
        }
    }
  regfree(&reg);
}

// Switches to each installed locale in turn and checks the three ranges.
// A locale that is listed but cannot be selected is a fatal error.
int
main()
{
  std::vector<std::string> locales{get_locales()};
  for (const auto &locale : locales)
    {
      if (setlocale(LC_ALL, locale.c_str()) == nullptr)
        err(1, "Cannot set locale to %s", locale.c_str());
      test_regexp_range(locale.c_str(), "[0-9]", std::make_pair(L'0', L'9'));
      test_regexp_range(locale.c_str(), "[a-z]", std::make_pair(L'a', L'z'));
      test_regexp_range(locale.c_str(), "[A-Z]", std::make_pair(L'A', L'Z'));
    }
}
--------------DF22FF72530EDAE564BFED98--