From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2155) id 1ED703858C3A; Mon, 20 Feb 2023 22:01:14 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 1ED703858C3A DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1676930474; bh=ixqDAJPFznlCl4XEQARSOyjIoQfcbCjCr9tW6QN5UKo=; h=From:To:Subject:Date:From; b=Ro4eQpMshfxfB287KLhClpOtaTyJO5MQPvcn7kmvHsS5bG5r/uQ9JyKYYrcIH2wvN p3bVWPjBzJHuP2aKxeEiohc0Gy+VSPaunHZtBedd1/4R/EE4WDiLkWT2JIbhQgwdmx 7/Esmlkd/YHzXLNeZWiyBN4JR28rhCXgmPDxlRM0= Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable From: Corinna Vinschen To: cygwin-cvs@sourceware.org Subject: [newlib-cygwin/main] Cygwin: glob: implement collating symbol support X-Act-Checkin: newlib-cygwin X-Git-Author: Corinna Vinschen X-Git-Refname: refs/heads/main X-Git-Oldrev: 244faaea8e24c70a1d61b939623364e3bdfaa28c X-Git-Newrev: ce5aa098071304cfd3bd1bd535a7571089344b1a Message-Id: <20230220220114.1ED703858C3A@sourceware.org> Date: Mon, 20 Feb 2023 22:01:14 +0000 (GMT) List-Id: https://sourceware.org/git/gitweb.cgi?p=3Dnewlib-cygwin.git;h=3Dce5aa098071= 304cfd3bd1bd535a7571089344b1a commit ce5aa098071304cfd3bd1bd535a7571089344b1a Author: Corinna Vinschen AuthorDate: Mon Feb 20 22:50:17 2023 +0100 Commit: Corinna Vinschen CommitDate: Mon Feb 20 22:50:17 2023 +0100 Cygwin: glob: implement collating symbol support =20 Allow the [.symbol.] expression =20 This requires a string comparison rather than a character comparison. Introduce and use __wscollate_range_cmp. 
=20 Signed-off-by: Corinna Vinschen Diff: --- winsup/cygwin/glob.cc | 115 ++++++++++++++++++++++-------= ---- winsup/cygwin/local_includes/collate.h | 1 + winsup/cygwin/nlsfuncs.cc | 19 ++++++ 3 files changed, 98 insertions(+), 37 deletions(-) diff --git a/winsup/cygwin/glob.cc b/winsup/cygwin/glob.cc index 4ef947929a58..66681786aae8 100644 --- a/winsup/cygwin/glob.cc +++ b/winsup/cygwin/glob.cc @@ -160,6 +160,9 @@ typedef char Char; #define M_SET META('[') #define M_NAMED META(':') #define M_EQUIV META('=3D') +#define M_COLL(_ccnt) META('.' | ((_ccnt) << 8)) +#define M_COLL_P(_c) (((_c) & M_COLL_MASK) =3D=3D META('.')) +#define M_COLL_CNT(_c) (((_c) & ~M_COLL_MASK) >> 8) #define ismeta(c) (((c)&M_QUOTE) !=3D 0) =20 static int compare(const void *, const void *); @@ -528,41 +531,61 @@ glob0(const Char *pattern, glob_t *pglob, size_t *lim= it) *bufnext++ =3D M_SET; if (c =3D=3D NOT) *bufnext++ =3D M_NOT; - c =3D *qpatnext; + c =3D *qpatnext++; do { wint_t wclass[64]; Char ctype; =20 - ctype =3D check_classes_expr(qpatnext, wclass, + ctype =3D check_classes_expr(--qpatnext, wclass, 64); - if (ctype) { + ++qpatnext; + if (ctype =3D=3D COLON) { wctype_t type; - - if (ctype =3D=3D COLON) { - char cclass[64]; - - /* No worries, char classes are - ASCII-only anyway */ - wcitoascii (cclass, wclass); - if ((type =3D wctype (cclass))) { - *bufnext++ =3D M_NAMED; - *bufnext++ =3D CHAR (type); - } - } else if (ctype =3D=3D EQUALS && - wclass[0] && !wclass[1]) { + char cclass[64]; + + /* No worries, char classes are + ASCII-only anyway */ + wcitoascii (cclass, wclass); + if ((type =3D wctype (cclass))) { + *bufnext++ =3D M_NAMED; + *bufnext++ =3D CHAR (type); + } + continue; + } + if (ctype =3D=3D EQUALS) { + if (wclass[0] && !wclass[1]) { *bufnext++ =3D M_EQUIV; *bufnext++ =3D CHAR (wclass[0]); } - /* TODO: [. 
is ignored yet */ - qpatnext++; continue; } - *bufnext++ =3D CHAR(c); + if (ctype =3D=3D DOT && + is_unicode_coll_elem (wclass)) { + *bufnext++ =3D + M_COLL (wcilen (wclass)); + wint_t *wcp =3D wclass; + while ((*bufnext++ =3D *wcp++)) + ; + --bufnext; /* drop NUL */ + } else + *bufnext++ =3D CHAR(c); if (*qpatnext =3D=3D RANGE && (c =3D qpatnext[1]) !=3D RBRACKET) { *bufnext++ =3D M_RNG; - *bufnext++ =3D CHAR(c); - qpatnext +=3D 2; + + ctype =3D check_classes_expr(++qpatnext, + wclass, 64); + if (ctype =3D=3D DOT && + is_unicode_coll_elem (wclass)) { + *bufnext++ =3D + M_COLL (wcilen (wclass)); + wint_t *wcp =3D wclass; + while ((*bufnext++ =3D *wcp++)) + ; + --bufnext; /* drop NUL */ + } else + *bufnext++ =3D CHAR(c); + ++qpatnext; } } while ((c =3D *qpatnext++) !=3D RBRACKET); pglob->gl_flags |=3D GLOB_MAGCHAR; @@ -849,11 +872,12 @@ static int match(Char *name, Char *pat, Char *patend) { int ok, negate_range; - Char c, k; + Char *c, *k; + size_t k_len; =20 while (pat < patend) { - c =3D *pat++; - switch (c & M_MASK) { + c =3D pat++; + switch (*c & M_MASK) { case M_ALL: if (pat =3D=3D patend) return(1); @@ -868,36 +892,53 @@ match(Char *name, Char *pat, Char *patend) break; case M_SET: ok =3D 0; - if ((k =3D *name++) =3D=3D EOS) + if (*(k =3D name) =3D=3D EOS) return(0); + k_len =3D next_unicode_char (k); + name +=3D k_len; if ((negate_range =3D ((*pat & M_MASK) =3D=3D M_NOT)) !=3D EOS) ++pat; - while (((c =3D *pat++) & M_MASK) !=3D M_END) - if ((c & M_MASK) =3D=3D M_NAMED) { - if (iswctype (k, *pat++)) + while ((*(c =3D pat++) & M_MASK) !=3D M_END) { + size_t len1 =3D 1, len2 =3D 1; + + if ((*c & M_MASK) =3D=3D M_NAMED) { + if (iswctype (*k, *pat++)) ok =3D 1; - } else if ((c & M_MASK) =3D=3D M_EQUIV) { - if (is_unicode_equiv (k, *pat++)) + continue; + } + if ((*c & M_MASK) =3D=3D M_EQUIV) { + if (is_unicode_equiv (*k, *pat++)) ok =3D 1; - } else if ((*pat & M_MASK) =3D=3D M_RNG) { + continue; + } + if (M_COLL_P(*c)) { + len1 =3D M_COLL_CNT(*c); + ++c; + pat 
+=3D len1; + } + if ((*pat & M_MASK) =3D=3D M_RNG) { + if (M_COLL_P(pat[1])) + len2 =3D M_COLL_CNT(*++pat); #ifdef __CYGWIN__ if ((!__get_current_collate_locale ()->lcid) ? #else if (__collate_load_error ? #endif - CCHAR(c) <=3D CCHAR(k) && CCHAR(k) <=3D CCHAR(pat[1]) : - __wcollate_range_cmp(CCHAR(c), CCHAR(k)) <=3D 0 - && __wcollate_range_cmp(CCHAR(k), CCHAR(pat[1])) <=3D 0 + *c <=3D *k && *k <=3D pat[1] : + __wscollate_range_cmp(c, k, len1, k_len) <=3D 0 + && __wscollate_range_cmp(k, pat + 1, k_len, len2) <=3D 0 ) ok =3D 1; - pat +=3D 2; - } else if (c =3D=3D k) + pat +=3D len2 + 1; + } else if (len1 =3D=3D k_len && + wcincmp (c, k, len1) =3D=3D 0) ok =3D 1; + } if (ok =3D=3D negate_range) return(0); break; default: - if (Cchar(*name++) !=3D Cchar(c)) + if (Cchar(*name++) !=3D Cchar(*c)) return(0); break; } diff --git a/winsup/cygwin/local_includes/collate.h b/winsup/cygwin/local_i= ncludes/collate.h index 7b4c72dd582e..498d5e1cd431 100644 --- a/winsup/cygwin/local_includes/collate.h +++ b/winsup/cygwin/local_includes/collate.h @@ -14,6 +14,7 @@ extern "C" { extern const int __collate_load_error; =20 extern int __wcollate_range_cmp (wint_t, wint_t); +extern int __wscollate_range_cmp (wint_t *, wint_t *, size_t, size_t); =20 int is_unicode_equiv (wint_t, wint_t); =20 diff --git a/winsup/cygwin/nlsfuncs.cc b/winsup/cygwin/nlsfuncs.cc index 20143f19d8d3..eb9948dd37fc 100644 --- a/winsup/cygwin/nlsfuncs.cc +++ b/winsup/cygwin/nlsfuncs.cc @@ -1195,6 +1195,25 @@ __wcollate_range_cmp (wint_t c1, wint_t c2) return wcscoll (s1, s2); } =20 +/* Not so much BSD. Used from glob.cc, fnmatch.c and regcomp.c. + + First arg is always from pattern space, second arg is the tested string. + len is the length of the pattern in the first arg. 
*/ +extern "C" int +__wscollate_range_cmp (wint_t *c1, wint_t *c2, + size_t c1len, size_t c2len) +{ + wchar_t s1[c1len * 2 + 1] =3D { 0 }; /* # of chars if all are surrogates= */ + wchar_t s2[c2len * 2 + 1] =3D { 0 }; + + wcintowcs (s1, c1, c1len); + wcintowcs (s2, c2, c2len); + return wcscoll_l (s1, s2, __get_current_locale ()); +} + +const size_t ce_size =3D sizeof collating_element / sizeof *collating_elem= ent; +const size_t ce_e_size =3D sizeof *collating_element; + /* Check if UTF-32 input character `test' is in the same equivalence class as UTF-32 character 'eqv'. Note that we only recognize input in Unicode normalization form C, that