public inbox for cygwin-cvs@sourceware.org help / color / mirror / Atom feed
From: Corinna Vinschen <corinna@sourceware.org> To: cygwin-cvs@sourceware.org Subject: [newlib-cygwin/main] Cygwin: glob: implement collating symbol support Date: Mon, 20 Feb 2023 22:01:14 +0000 (GMT) [thread overview] Message-ID: <20230220220114.1ED703858C3A@sourceware.org> (raw) https://sourceware.org/git/gitweb.cgi?p=newlib-cygwin.git;h=ce5aa098071304cfd3bd1bd535a7571089344b1a commit ce5aa098071304cfd3bd1bd535a7571089344b1a Author: Corinna Vinschen <corinna@vinschen.de> AuthorDate: Mon Feb 20 22:50:17 2023 +0100 Commit: Corinna Vinschen <corinna@vinschen.de> CommitDate: Mon Feb 20 22:50:17 2023 +0100 Cygwin: glob: implement collating symbol support Allow the [.<sym>.] expression. This requires a string comparison rather than a character comparison. Introduce and use __wscollate_range_cmp. Signed-off-by: Corinna Vinschen <corinna@vinschen.de> Diff: --- winsup/cygwin/glob.cc | 115 ++++++++++++++++++++++----------- winsup/cygwin/local_includes/collate.h | 1 + winsup/cygwin/nlsfuncs.cc | 19 ++++++ 3 files changed, 98 insertions(+), 37 deletions(-) diff --git a/winsup/cygwin/glob.cc b/winsup/cygwin/glob.cc index 4ef947929a58..66681786aae8 100644 --- a/winsup/cygwin/glob.cc +++ b/winsup/cygwin/glob.cc @@ -160,6 +160,9 @@ typedef char Char; #define M_SET META('[') #define M_NAMED META(':') #define M_EQUIV META('=') +#define M_COLL(_ccnt) META('.' 
| ((_ccnt) << 8)) +#define M_COLL_P(_c) (((_c) & M_COLL_MASK) == META('.')) +#define M_COLL_CNT(_c) (((_c) & ~M_COLL_MASK) >> 8) #define ismeta(c) (((c)&M_QUOTE) != 0) static int compare(const void *, const void *); @@ -528,41 +531,61 @@ glob0(const Char *pattern, glob_t *pglob, size_t *limit) *bufnext++ = M_SET; if (c == NOT) *bufnext++ = M_NOT; - c = *qpatnext; + c = *qpatnext++; do { wint_t wclass[64]; Char ctype; - ctype = check_classes_expr(qpatnext, wclass, + ctype = check_classes_expr(--qpatnext, wclass, 64); - if (ctype) { + ++qpatnext; + if (ctype == COLON) { wctype_t type; - - if (ctype == COLON) { - char cclass[64]; - - /* No worries, char classes are - ASCII-only anyway */ - wcitoascii (cclass, wclass); - if ((type = wctype (cclass))) { - *bufnext++ = M_NAMED; - *bufnext++ = CHAR (type); - } - } else if (ctype == EQUALS && - wclass[0] && !wclass[1]) { + char cclass[64]; + + /* No worries, char classes are + ASCII-only anyway */ + wcitoascii (cclass, wclass); + if ((type = wctype (cclass))) { + *bufnext++ = M_NAMED; + *bufnext++ = CHAR (type); + } + continue; + } + if (ctype == EQUALS) { + if (wclass[0] && !wclass[1]) { *bufnext++ = M_EQUIV; *bufnext++ = CHAR (wclass[0]); } - /* TODO: [. 
is ignored yet */ - qpatnext++; continue; } - *bufnext++ = CHAR(c); + if (ctype == DOT && + is_unicode_coll_elem (wclass)) { + *bufnext++ = + M_COLL (wcilen (wclass)); + wint_t *wcp = wclass; + while ((*bufnext++ = *wcp++)) + ; + --bufnext; /* drop NUL */ + } else + *bufnext++ = CHAR(c); if (*qpatnext == RANGE && (c = qpatnext[1]) != RBRACKET) { *bufnext++ = M_RNG; - *bufnext++ = CHAR(c); - qpatnext += 2; + + ctype = check_classes_expr(++qpatnext, + wclass, 64); + if (ctype == DOT && + is_unicode_coll_elem (wclass)) { + *bufnext++ = + M_COLL (wcilen (wclass)); + wint_t *wcp = wclass; + while ((*bufnext++ = *wcp++)) + ; + --bufnext; /* drop NUL */ + } else + *bufnext++ = CHAR(c); + ++qpatnext; } } while ((c = *qpatnext++) != RBRACKET); pglob->gl_flags |= GLOB_MAGCHAR; @@ -849,11 +872,12 @@ static int match(Char *name, Char *pat, Char *patend) { int ok, negate_range; - Char c, k; + Char *c, *k; + size_t k_len; while (pat < patend) { - c = *pat++; - switch (c & M_MASK) { + c = pat++; + switch (*c & M_MASK) { case M_ALL: if (pat == patend) return(1); @@ -868,36 +892,53 @@ match(Char *name, Char *pat, Char *patend) break; case M_SET: ok = 0; - if ((k = *name++) == EOS) + if (*(k = name) == EOS) return(0); + k_len = next_unicode_char (k); + name += k_len; if ((negate_range = ((*pat & M_MASK) == M_NOT)) != EOS) ++pat; - while (((c = *pat++) & M_MASK) != M_END) - if ((c & M_MASK) == M_NAMED) { - if (iswctype (k, *pat++)) + while ((*(c = pat++) & M_MASK) != M_END) { + size_t len1 = 1, len2 = 1; + + if ((*c & M_MASK) == M_NAMED) { + if (iswctype (*k, *pat++)) ok = 1; - } else if ((c & M_MASK) == M_EQUIV) { - if (is_unicode_equiv (k, *pat++)) + continue; + } + if ((*c & M_MASK) == M_EQUIV) { + if (is_unicode_equiv (*k, *pat++)) ok = 1; - } else if ((*pat & M_MASK) == M_RNG) { + continue; + } + if (M_COLL_P(*c)) { + len1 = M_COLL_CNT(*c); + ++c; + pat += len1; + } + if ((*pat & M_MASK) == M_RNG) { + if (M_COLL_P(pat[1])) + len2 = M_COLL_CNT(*++pat); #ifdef __CYGWIN__ if 
((!__get_current_collate_locale ()->lcid) ? #else if (__collate_load_error ? #endif - CCHAR(c) <= CCHAR(k) && CCHAR(k) <= CCHAR(pat[1]) : - __wcollate_range_cmp(CCHAR(c), CCHAR(k)) <= 0 - && __wcollate_range_cmp(CCHAR(k), CCHAR(pat[1])) <= 0 + *c <= *k && *k <= pat[1] : + __wscollate_range_cmp(c, k, len1, k_len) <= 0 + && __wscollate_range_cmp(k, pat + 1, k_len, len2) <= 0 ) ok = 1; - pat += 2; - } else if (c == k) + pat += len2 + 1; + } else if (len1 == k_len && + wcincmp (c, k, len1) == 0) ok = 1; + } if (ok == negate_range) return(0); break; default: - if (Cchar(*name++) != Cchar(c)) + if (Cchar(*name++) != Cchar(*c)) return(0); break; } diff --git a/winsup/cygwin/local_includes/collate.h b/winsup/cygwin/local_includes/collate.h index 7b4c72dd582e..498d5e1cd431 100644 --- a/winsup/cygwin/local_includes/collate.h +++ b/winsup/cygwin/local_includes/collate.h @@ -14,6 +14,7 @@ extern "C" { extern const int __collate_load_error; extern int __wcollate_range_cmp (wint_t, wint_t); +extern int __wscollate_range_cmp (wint_t *, wint_t *, size_t, size_t); int is_unicode_equiv (wint_t, wint_t); diff --git a/winsup/cygwin/nlsfuncs.cc b/winsup/cygwin/nlsfuncs.cc index 20143f19d8d3..eb9948dd37fc 100644 --- a/winsup/cygwin/nlsfuncs.cc +++ b/winsup/cygwin/nlsfuncs.cc @@ -1195,6 +1195,25 @@ __wcollate_range_cmp (wint_t c1, wint_t c2) return wcscoll (s1, s2); } +/* Not so much BSD. Used from glob.cc, fnmatch.c and regcomp.c. + + First arg is always from pattern space, second arg is the tested string. + len is the length of the pattern in the first arg. 
*/ +extern "C" int +__wscollate_range_cmp (wint_t *c1, wint_t *c2, + size_t c1len, size_t c2len) +{ + wchar_t s1[c1len * 2 + 1] = { 0 }; /* # of chars if all are surrogates */ + wchar_t s2[c2len * 2 + 1] = { 0 }; + + wcintowcs (s1, c1, c1len); + wcintowcs (s2, c2, c2len); + return wcscoll_l (s1, s2, __get_current_locale ()); +} + +const size_t ce_size = sizeof collating_element / sizeof *collating_element; +const size_t ce_e_size = sizeof *collating_element; + /* Check if UTF-32 input character `test' is in the same equivalence class as UTF-32 character 'eqv'. Note that we only recognize input in Unicode normalization form C, that
reply other threads:[~2023-02-20 22:01 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20230220220114.1ED703858C3A@sourceware.org \ --to=corinna@sourceware.org \ --cc=cygwin-cvs@sourceware.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).