From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2155) id 7BAB13850857; Fri, 24 Feb 2023 15:42:10 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 7BAB13850857 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1677253330; bh=ZYlp7gN4kWb3nW/UR7rfyJ1gAv/gkkfK5Y4C2sBLXbA=; h=From:To:Subject:Date:From; b=G3a2kl6kU9axkfET3QFARILrMLbL0CzuargV3zEi1aRw6TiOY7yHj0xXojqdQGrop AYHRdRXf8/MH4eka9HZo1NbQxvOc8uGDrVVOHcpxK9rj6rgm5xADrcbE+datt7D9Sx ERylXyg/uw24rTDtqOerI6QYElDjBbz7Px2wVRxI= Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable From: Corinna Vinschen To: cygwin-cvs@sourceware.org, newlib-cvs@sourceware.org Subject: [newlib-cygwin/main] Cygwin: convert Windows locale handling from LCID to ISO5646 strings X-Act-Checkin: newlib-cygwin X-Git-Author: Corinna Vinschen X-Git-Refname: refs/heads/main X-Git-Oldrev: 89eb4bce152f93a9ace37bb7c67941a0e3bf19ae X-Git-Newrev: e95a7a795522165a2d022dbb4df281d6be661c83 Message-Id: <20230224154210.7BAB13850857@sourceware.org> Date: Fri, 24 Feb 2023 15:42:10 +0000 (GMT) List-Id: https://sourceware.org/git/gitweb.cgi?p=3Dnewlib-cygwin.git;h=3De95a7a79552= 2165a2d022dbb4df281d6be661c83 commit e95a7a795522165a2d022dbb4df281d6be661c83 Author: Corinna Vinschen AuthorDate: Fri Feb 24 16:37:44 2023 +0100 Commit: Corinna Vinschen CommitDate: Fri Feb 24 16:40:58 2023 +0100 Cygwin: convert Windows locale handling from LCID to ISO5646 strings =20 Since Windows Vista, locale handling is converted from using numeric locale identifiers (LCID) to using ISO5646 locale strings. In the meantime Windows introduced new locales which don't even have a LCID attached. Those were unusable in Cygwin because locale information for these locales required to call the new locale functions taking a locale string. =20 Convert Cygwin to drop LCIDs and use Windows ISO5646 locales instead. 
=20 The last place using LCIDs is the __set_charset_from_locale function. Checking numerically is easier and usually faster than checking strings. However, this function is clearly a TODO. Diff: --- newlib/libc/locale/setlocale.h | 2 +- winsup/cygwin/glob.cc | 2 +- winsup/cygwin/libc/fnmatch.c | 2 +- winsup/cygwin/nlsfuncs.cc | 449 ++++++++++++++++++++++---------------= ---- winsup/cygwin/regex/regcomp.c | 2 +- 5 files changed, 243 insertions(+), 214 deletions(-) diff --git a/newlib/libc/locale/setlocale.h b/newlib/libc/locale/setlocale.h index 3530ec664dc2..7abf92f7ec2a 100644 --- a/newlib/libc/locale/setlocale.h +++ b/newlib/libc/locale/setlocale.h @@ -46,7 +46,7 @@ __BEGIN_DECLS #ifdef __CYGWIN__ struct lc_collate_T { - __uint32_t lcid; + wchar_t win_locale[ENCODING_LEN + 1]; int (*mbtowc) (struct _reent *, wchar_t *, const char *, size_t, mbstate_t *); char codeset[ENCODING_LEN + 1]; diff --git a/winsup/cygwin/glob.cc b/winsup/cygwin/glob.cc index 64c3507d891e..63a07a930f06 100644 --- a/winsup/cygwin/glob.cc +++ b/winsup/cygwin/glob.cc @@ -929,7 +929,7 @@ match(Char *name, Char *pat, Char *patend) if (M_COLL_P(pat[1])) len2 =3D M_COLL_CNT(*++pat); #ifdef __CYGWIN__ - if ((!__get_current_collate_locale ()->lcid) ? + if ((!__get_current_collate_locale ()->win_locale[0]) ? #else if (__collate_load_error ? #endif diff --git a/winsup/cygwin/libc/fnmatch.c b/winsup/cygwin/libc/fnmatch.c index dfbe3863fbfa..beb67f01e5d1 100644 --- a/winsup/cygwin/libc/fnmatch.c +++ b/winsup/cygwin/libc/fnmatch.c @@ -330,7 +330,7 @@ rangematch(const char *pattern, wint_t test, int flags,= char **newp, c2 =3D towlower(c2); =20 #ifdef __CYGWIN__ - if ((!__get_current_collate_locale ()->lcid) ? + if ((!__get_current_collate_locale ()->win_locale[0]) ? #else if (table->__collate_load_error ?
#endif diff --git a/winsup/cygwin/nlsfuncs.cc b/winsup/cygwin/nlsfuncs.cc index 0d2764b05847..60bababd2668 100644 --- a/winsup/cygwin/nlsfuncs.cc +++ b/winsup/cygwin/nlsfuncs.cc @@ -25,128 +25,129 @@ details. */ #define _LC(x) &lc_##x##_ptr,lc_##x##_end-lc_##x##_ptr =20 #define getlocaleinfo(category,type) \ - __getlocaleinfo(lcid,(type),_LC(category)) + __getlocaleinfo(win_locale,(type),_LC(category)) +#define getlocaleint(type) \ + __getlocaleint(win_locale,(type)) #define setlocaleinfo(category,val) \ __setlocaleinfo(_LC(category),(val)) #define eval_datetimefmt(type,flags) \ - __eval_datetimefmt(lcid,(type),(flags),&lc_time_ptr,\ + __eval_datetimefmt(win_locale,(type),(flags),&lc_time_ptr,\ lc_time_end-lc_time_ptr) #define charfromwchar(category,in) \ __charfromwchar (_##category##_locale->in,_LC(category),f_wctomb) =20 #define has_modifier(x) ((x)[0] && !strcmp (modifier, (x))) =20 -static char last_locale[ENCODING_LEN + 1]; -static LCID last_lcid; - -/* Fetch LCID from POSIX locale specifier. +/* Fetch Windows RFC 5646 locale from POSIX locale specifier. Return values: =20 -1: Invalid locale 0: C or POSIX - >0: LCID + 1: valid locale */ -static LCID -__get_lcid_from_locale (const char *name) +static int +__get_rfc5646_from_locale (const char *name, wchar_t *win_locale) { - char locale[ENCODING_LEN + 1]; - char *c; - LCID lcid; - - /* Speed up reusing the same locale as before, for instance in LC_ALL ca= se. */ - if (!strcmp (name, last_locale)) - { - debug_printf ("LCID=3D%04y", last_lcid); - return last_lcid; - } - stpcpy (last_locale, name); - stpcpy (locale, name); - /* Store modifier for later use. */ - const char *modifier =3D strchr (last_locale, '@') ? : ""; + wchar_t wlocale[ENCODING_LEN + 1] =3D { 0 }; + wchar_t locale[ENCODING_LEN + 1]; + wchar_t *c; + + win_locale[0] =3D L'\0'; + mbstowcs (locale, name, ENCODING_LEN + 1); + /* Remember modifier for later use. */ + const char *modifier =3D strchr (name, '@') ? 
: ""; /* Drop charset and modifier */ - c =3D strchr (locale, '.'); + c =3D wcschr (locale, L'.'); if (!c) - c =3D strchr (locale, '@'); + c =3D wcschr (locale, L'@'); if (c) - *c =3D '\0'; + *c =3D L'\0'; /* "POSIX" already converted to "C" in loadlocale. */ - if (!strcmp (locale, "C")) - return last_lcid =3D 0; - c =3D strchr (locale, '_'); + if (!wcscmp (locale, L"C")) + return 0; + c =3D wcschr (locale, '_'); if (!c) - return last_lcid =3D (LCID) -1; - - wchar_t wlocale[ENCODING_LEN + 1]; + { + /* try if the locale can be resolved from the language tag + fix up Linux-only locale first */ + if (!wcscmp (locale, L"ber")) + wcscpy (locale, L"tzm"); + if (ResolveLocaleName (locale, wlocale, ENCODING_LEN + 1) <=3D 0) + return -1; + wcpcpy (win_locale, wlocale); + return 1; + } =20 - /* Convert to RFC 4646 syntax. */ + /* Convert to RFC 5646 syntax. */ *c =3D '-'; - mbstowcs (wlocale, locale, ENCODING_LEN + 1); - lcid =3D LocaleNameToLCID (wlocale, 0); - /* Bug on Windows 10: LocaleNameToLCID returns LOCALE_CUSTOM_UNSPECIFIED - for unknown locales. */ - if (lcid =3D=3D 0 || lcid =3D=3D LOCALE_CUSTOM_UNSPECIFIED) + /* Override a few locales with a different default script as used + on Linux. Linux also supports no_NO which is equivalent to nb_NO, + but Windows can resolve that nicely. Also, "tzm" and "zgh" are + subsumed under "ber" on Linux. 
*/ + struct { + const wchar_t *loc; + const wchar_t *wloc; + } override_locale[] =3D { + { L"ber-DZ" , L"tzm-Latn-DZ" }, + { L"ber-MA" , L"zgh-Tfng-MA" }, + { L"mn-CN" , L"mn-Mong-CN" }, + { L"mn-MN" , L"mn-Mong-MN" }, + { L"pa-PK" , L"pa-Arab-PK" }, + { L"sd-IN" , L"sd-Deva-IN" }, + { L"sr-BA" , L"sr-Cyrl-BA" }, + { L"sr-ME" , L"sr-Cyrl-ME" }, + { L"sr-RS" , L"sr-Cyrl-RS" }, + { L"sr-XK" , L"sr-Cyrl-XK" }, + { L"tzm-MA", L"tzm-Tfng-MA" }, + { NULL , NULL } + }; + + for (int i =3D 0; override_locale[i].loc + && override_locale[i].loc[0] <=3D locale[0]; ++i) { - /* Unfortunately there are a couple of locales for which no form - without a Script part per RFC 4646 exists. - Linux also supports no_NO which is equivalent to nb_NO. */ - struct { - const char *loc; - const wchar_t *wloc; - } sc_only_locale[] =3D { - { "az-AZ" , L"az-Latn-AZ" }, - { "bs-BA" , L"bs-Latn-BA" }, - { "chr-US", L"chr-Cher-US"}, - { "ff-SN" , L"ff-Latn-SN" }, - { "ha-NG" , L"ha-Latn-NG" }, - { "iu-CA" , L"iu-Latn-CA" }, - { "ks-IN" , L"ks-Arab-IN" }, - { "ku-IQ" , L"ku-Arab-IQ" }, - { "mn-CN" , L"mn-Mong-CN" }, - { "mn-MN" , L"mn-Mong-MN" }, - { "no-NO" , L"nb-NO" }, - { "pa-PK" , L"pa-Arab-PK" }, - { "quc-GT", L"quc-Latn-GT" }, - { "sd-PK" , L"sd-Arab-PK" }, - { "sd-IN" , L"sd-Deva-IN" }, - { "sr-BA" , L"sr-Cyrl-BA" }, - { "sr-ME" , L"sr-Cyrl-ME" }, - { "sr-RS" , L"sr-Cyrl-RS" }, - { "tg-TJ" , L"tg-Cyrl-TJ" }, - { "tzm-DZ", L"tzm-Latn-DZ" }, - { "tzm-MA", L"tzm-Tfng-MA" }, - { "uz-UZ" , L"uz-Latn-UZ" }, - { NULL , NULL } - }; - for (int i =3D 0; sc_only_locale[i].loc - && sc_only_locale[i].loc[0] <=3D locale[0]; ++i) - if (!strcmp (locale, sc_only_locale[i].loc)) - { - lcid =3D LocaleNameToLCID (sc_only_locale[i].wloc, 0); - if (!strncmp (locale, "sr-", 3)) - { - /* "@latin" modifier for the sr_XY locales changes - collation behaviour so lcid should accommodate that - by being set to the Latin sublang. 
*/ - if (lcid !=3D 0 && lcid !=3D LOCALE_CUSTOM_UNSPECIFIED - && has_modifier ("@latin")) - lcid =3D MAKELANGID (lcid & 0x3ff, (lcid >> 10) - 1); - } - else if (!strncmp (locale, "uz-", 3)) - { - /* Equivalent for "@cyrillic" modifier in uz_UZ locale */ - if (lcid !=3D 0 && lcid !=3D LOCALE_CUSTOM_UNSPECIFIED - && has_modifier ("@cyrillic")) - lcid =3D MAKELANGID (lcid & 0x3ff, (lcid >> 10) + 1); - } - break; - } + if (!wcscmp (locale, override_locale[i].loc)) + { + wcscpy (wlocale, override_locale[i].wloc); + break; + } + } + if (!wlocale[0] + && ResolveLocaleName (locale, wlocale, ENCODING_LEN + 1) <=3D 1) + return -1; + + /* Check for modifiers changing the script */ + const wchar_t *iso15924_script[] =3D { L"Latn-", L"Cyrl-", L"Deva-", L"A= dlm-" }; + int idx =3D -1; + + if (modifier[0]) + { + if (!strcmp (++modifier, "latin")) + idx =3D 0; + else if (!strcmp (modifier, "cyrillic")) + idx =3D 1; + else if (!strcmp (modifier, "devanagari")) + idx =3D 2; + else if (!strcmp (modifier, "adlam")) + idx =3D 3; + } + if (idx >=3D 0) + { + wchar_t *iso3166 =3D wcschr (wlocale, L'-') + 1; + wchar_t *wlp; + + /* Copy iso639 language part including dash */ + wlp =3D wcpncpy (win_locale, wlocale, iso3166 - wlocale); + /* Concat new iso15924 script */ + wlp =3D wcpcpy (wlp, iso15924_script[idx]); + /* Concat iso3166 territory. Skip script, if already in the locale = */ + wchar_t *skip_script =3D wcschr (iso3166, L'-'); + if (skip_script) + iso3166 =3D skip_script + 1; + wcpcpy (wlp, iso3166); } - if (lcid && lcid !=3D LOCALE_CUSTOM_UNSPECIFIED) - last_lcid =3D lcid; else - last_lcid =3D (LCID) -1; - debug_printf ("LCID=3D%04y", last_lcid); - return last_lcid; + wcpcpy (win_locale, wlocale); + return 1; } =20 /* Never returns -1. Just skips invalid chars instead. 
Only if return_in= valid @@ -257,7 +258,7 @@ rebase_locale_buf (const void *ptrv, const void *ptrven= d, const char *newbase, } =20 static wchar_t * -__getlocaleinfo (LCID lcid, LCTYPE type, char **ptr, size_t size) +__getlocaleinfo (wchar_t *loc, LCTYPE type, char **ptr, size_t size) { size_t num; wchar_t *ret; @@ -265,7 +266,7 @@ __getlocaleinfo (LCID lcid, LCTYPE type, char **ptr, si= ze_t size) if ((uintptr_t) *ptr % 1) ++*ptr; ret =3D (wchar_t *) *ptr; - num =3D GetLocaleInfoW (lcid, type, ret, size / sizeof (wchar_t)); + num =3D GetLocaleInfoEx (loc, type, ret, size / sizeof (wchar_t)); *ptr =3D (char *) (ret + num); return ret; } @@ -296,10 +297,10 @@ __charfromwchar (const wchar_t *in, char **ptr, size_= t size, wctomb_p f_wctomb) } =20 static UINT -getlocaleint (LCID lcid, LCTYPE type) +__getlocaleint (wchar_t *loc, LCTYPE type) { UINT val; - return GetLocaleInfoW (lcid, type | LOCALE_RETURN_NUMBER, (PWCHAR) &val, + return GetLocaleInfoEx (loc, type | LOCALE_RETURN_NUMBER, (PWCHAR) &val, sizeof val) ? val : 0; } =20 @@ -310,7 +311,7 @@ enum dt_flags { }; =20 static wchar_t * -__eval_datetimefmt (LCID lcid, LCTYPE type, dt_flags flags, char **ptr, +__eval_datetimefmt (wchar_t *loc, LCTYPE type, dt_flags flags, char **ptr, size_t size) { wchar_t buf[80]; @@ -327,7 +328,7 @@ __eval_datetimefmt (LCID lcid, LCTYPE type, dt_flags fl= ags, char **ptr, ++*ptr; wchar_t *ret =3D (wchar_t *) *ptr; wchar_t *p =3D (wchar_t *) *ptr; - GetLocaleInfoW (lcid, type, buf, 80); + GetLocaleInfoEx (loc, type, buf, 80); for (wchar_t *fmt =3D buf; *fmt; ++fmt) switch (fc =3D *fmt) { @@ -390,20 +391,21 @@ __eval_datetimefmt (LCID lcid, LCTYPE type, dt_flags = flags, char **ptr, =20 /* Convert Windows grouping format into POSIX grouping format. */ static char * -conv_grouping (LCID lcid, LCTYPE type, char **lc_ptr) +conv_grouping (wchar_t *loc, LCTYPE type, char **lc_ptr) { - char buf[10]; /* Per MSDN max size of LOCALE_SGROUPING element incl. 
NUL= */ + wchar_t buf[10]; /* Per MSDN max size of LOCALE_SGROUPING element incl. = NUL */ bool repeat =3D false; char *ptr =3D *lc_ptr; char *ret =3D ptr; =20 - GetLocaleInfoA (lcid, type, buf, 10); - /* Convert Windows grouping format into POSIX grouping format. */ - for (char *c =3D buf; *c; ++c) + GetLocaleInfoEx (loc, type, buf, 10); + /* Convert Windows grouping format into POSIX grouping format. Note that + only ASCII chars are used in the grouping format. */ + for (wchar_t *c =3D buf; *c; ++c) { - if (*c < '0' || *c > '9') + if (*c < L'0' || *c > L'9') continue; - char val =3D *c - '0'; + char val =3D *c - L'0'; if (!val) { repeat =3D true; @@ -429,10 +431,11 @@ __set_lc_time_from_win (const char *name, char **lc_time_buf, wctomb_p f_wctomb, const char *charset) { - LCID lcid =3D __get_lcid_from_locale (name); - if (lcid =3D=3D (LCID) -1) - return lcid; - if (!lcid && !strcmp (charset, "ASCII")) + wchar_t win_locale[ENCODING_LEN + 1]; + int ret =3D __get_rfc5646_from_locale (name, win_locale); + if (ret < 0) + return ret; + if (!ret && !strcmp (charset, "ASCII")) return 0; =20 # define MAX_TIME_BUFFER_SIZE 4096 @@ -445,17 +448,17 @@ __set_lc_time_from_win (const char *name, char *lc_time_ptr =3D new_lc_time_buf; =20 /* C.foo is just a copy of "C" with fixed charset. */ - if (!lcid) + if (!ret) memcpy (_time_locale, _C_time_locale, sizeof (struct lc_time_T)); /* codeset */ _time_locale->codeset =3D lc_time_ptr; lc_time_ptr =3D stpcpy (lc_time_ptr, charset) + 1; =20 - if (lcid) + if (ret) { char locale[ENCODING_LEN + 1]; strcpy (locale, name); - /* Removes the charset from the locale and attach the modifer to the + /* Removes the charset from the locale and attach the modifier to the language_TERRITORY part. */ char *c =3D strchr (locale, '.'); if (c) @@ -476,17 +479,22 @@ __set_lc_time_from_win (const char *name, sizeof *lc_era, locale_cmp); =20 /* mon */ - /* Windows has a bug in Japanese and Korean locales. 
In these - locales, strings returned for LOCALE_SABBREVMONTHNAME* are missing - the suffix representing a month. Unfortunately this is not - documented in English. A Japanese article describing the problem - is http://msdn.microsoft.com/ja-jp/library/cc422084.aspx + /* Windows has a bug in "ja-JP" and "ko-KR" (but not in "ko-KP"). + In these locales, strings returned for LOCALE_SABBREVMONTHNAME* + are missing the suffix representing a month. + + A Japanese article describing the problem was + https://msdn.microsoft.com/ja-jp/library/cc422084.aspx, which is + only available via + https://web.archive.org/web/20110922195821/https://msdn.microsoft.com/ja= -jp/library/cc422084.aspx + these days. Testing indicates that this problem is still present + in Windows 11. + The workaround is to use LOCALE_SMONTHNAME* in these locales, even for the abbreviated month name. */ - const LCTYPE mon_base =3D - lcid =3D=3D MAKELANGID (LANG_JAPANESE, SUBLANG_JAPANESE_JAPAN) - || lcid =3D=3D MAKELANGID (LANG_KOREAN, SUBLANG_KOREAN) - ? LOCALE_SMONTHNAME1 : LOCALE_SABBREVMONTHNAME1; + const LCTYPE mon_base =3D !wcscmp (win_locale, L"ja-JP") + || !wcscmp (win_locale, L"ko-KR") + ? LOCALE_SMONTHNAME1 : LOCALE_SABBREVMONTHNAME1; for (int i =3D 0; i < 12; ++i) { _time_locale->wmon[i] =3D getlocaleinfo (time, mon_base + i); @@ -495,7 +503,8 @@ __set_lc_time_from_win (const char *name, /* month and alt_month */ for (int i =3D 0; i < 12; ++i) { - _time_locale->wmonth[i] =3D getlocaleinfo (time, LOCALE_SMONTHNAME1 + i= ); + _time_locale->wmonth[i] =3D getlocaleinfo (time, + LOCALE_SMONTHNAME1 + i); _time_locale->month[i] =3D _time_locale->alt_month[i] =3D charfromwchar (time, wmonth[i]); } @@ -570,7 +579,7 @@ __set_lc_time_from_win (const char *name, /* md */ { wchar_t buf[80]; - GetLocaleInfoW (lcid, LOCALE_IDATE, buf, 80); + GetLocaleInfoEx (win_locale, LOCALE_IDATE, buf, 80); _time_locale->md_order =3D (const char *) lc_time_ptr; lc_time_ptr =3D stpcpy (lc_time_ptr, *buf =3D=3D L'1' ? 
"dm" : "md") + 1; } @@ -690,10 +699,11 @@ __set_lc_ctype_from_win (const char *name, char **lc_ctype_buf, wctomb_p f_wctomb, const char *charset, int mb_cur_max) { - LCID lcid =3D __get_lcid_from_locale (name); - if (lcid =3D=3D (LCID) -1) - return lcid; - if (!lcid && !strcmp (charset, "ASCII")) + wchar_t win_locale[ENCODING_LEN + 1]; + int ret =3D __get_rfc5646_from_locale (name, win_locale); + if (ret < 0) + return ret; + if (!ret && !strcmp (charset, "ASCII")) return 0; =20 # define MAX_CTYPE_BUFFER_SIZE 256 @@ -704,7 +714,7 @@ __set_lc_ctype_from_win (const char *name, return -1; char *lc_ctype_ptr =3D new_lc_ctype_buf; /* C.foo is just a copy of "C" with fixed charset. */ - if (!lcid) + if (!ret) memcpy (_ctype_locale, _C_ctype_locale, sizeof (struct lc_ctype_T)); /* codeset */ _ctype_locale->codeset =3D lc_ctype_ptr; @@ -713,11 +723,11 @@ __set_lc_ctype_from_win (const char *name, _ctype_locale->mb_cur_max =3D lc_ctype_ptr; *lc_ctype_ptr++ =3D mb_cur_max; *lc_ctype_ptr++ =3D '\0'; - if (lcid) + if (ret) { /* outdigits and woutdigits */ wchar_t digits[11]; - GetLocaleInfoW (lcid, LOCALE_SNATIVEDIGITS, digits, 11); + GetLocaleInfoEx (win_locale, LOCALE_SNATIVEDIGITS, digits, 11); for (int i =3D 0; i <=3D 9; ++i) { mbstate_t state; @@ -762,10 +772,11 @@ __set_lc_numeric_from_win (const char *name, char **lc_numeric_buf, wctomb_p f_wctomb, const char *charset) { - LCID lcid =3D __get_lcid_from_locale (name); - if (lcid =3D=3D (LCID) -1) - return lcid; - if (!lcid && !strcmp (charset, "ASCII")) + wchar_t win_locale[ENCODING_LEN + 1]; + int ret =3D __get_rfc5646_from_locale (name, win_locale); + if (ret < 0) + return ret; + if (!ret && !strcmp (charset, "ASCII")) return 0; =20 # define MAX_NUMERIC_BUFFER_SIZE 256 @@ -777,20 +788,20 @@ __set_lc_numeric_from_win (const char *name, return -1; char *lc_numeric_ptr =3D new_lc_numeric_buf; /* C.foo is just a copy of "C" with fixed charset. 
*/ - if (!lcid) + if (!ret) memcpy (_numeric_locale, _C_numeric_locale, sizeof (struct lc_numeric_= T)); else { /* decimal_point and thousands_sep */ - if (lcid =3D=3D 0x0429) /* fa_IR. Windows decimal_point is slash, - correct is dot */ + /* fa_IR. Windows decimal_point is slash, correct is dot */ + if (!wcscmp (win_locale, L"fa-IR")) { _numeric_locale->wdecimal_point =3D setlocaleinfo (numeric, L'.'); _numeric_locale->wthousands_sep =3D setlocaleinfo (numeric, L','); } - else if (lcid =3D=3D 0x0463) /* ps_AF. Windows decimal_point is dot, - thousands_sep is comma, correct are - arabic separators. */ + /* ps_AF. Windows decimal_point is dot, thousands_sep is comma, + correct are arabic separators. */ + else if (!wcscmp (win_locale, L"ps-AF")) { _numeric_locale->wdecimal_point =3D setlocaleinfo (numeric, 0x066b); _numeric_locale->wthousands_sep =3D setlocaleinfo (numeric, 0x066c); @@ -805,7 +816,7 @@ __set_lc_numeric_from_win (const char *name, _numeric_locale->decimal_point =3D charfromwchar (numeric, wdecimal_= point); _numeric_locale->thousands_sep =3D charfromwchar (numeric, wthousand= s_sep); /* grouping */ - _numeric_locale->grouping =3D conv_grouping (lcid, LOCALE_SGROUPING, + _numeric_locale->grouping =3D conv_grouping (win_locale, LOCALE_SGRO= UPING, &lc_numeric_ptr); } /* codeset */ @@ -837,10 +848,11 @@ __set_lc_monetary_from_win (const char *name, char **lc_monetary_buf, wctomb_p f_wctomb, const char *charset) { - LCID lcid =3D __get_lcid_from_locale (name); - if (lcid =3D=3D (LCID) -1) - return lcid; - if (!lcid && !strcmp (charset, "ASCII")) + wchar_t win_locale[ENCODING_LEN + 1]; + int ret =3D __get_rfc5646_from_locale (name, win_locale); + if (ret < 0) + return ret; + if (!ret && !strcmp (charset, "ASCII")) return 0; =20 # define MAX_MONETARY_BUFFER_SIZE 512 @@ -852,7 +864,7 @@ __set_lc_monetary_from_win (const char *name, return -1; char *lc_monetary_ptr =3D new_lc_monetary_buf; /* C.foo is just a copy of "C" with fixed charset. 
*/ - if (!lcid) + if (!ret) memcpy (_monetary_locale, _C_monetary_locale, sizeof (struct lc_moneta= ry_T)); else { @@ -881,11 +893,11 @@ __set_lc_monetary_from_win (const char *name, _monetary_locale->currency_symbol =3D charfromwchar (monetary, wcurrency_symbol); /* mon_decimal_point and mon_thousands_sep */ - if (lcid =3D=3D 0x0429 || lcid =3D=3D 0x0463) /* fa_IR or ps_AF. Wi= ndows - mon_decimal_point is slash - and comma, mon_thousands_sep - is comma and dot, correct - are arabic separators. */ + /* fa_IR or ps_AF. Windows mon_decimal_point is slash and comma, + mon_thousands_sep is comma and dot, correct + are arabic separators. */ + if (!wcscmp (win_locale, L"fa-IR") + || !wcscmp (win_locale, L"ps-AF")) { _monetary_locale->wmon_decimal_point =3D setlocaleinfo (monetary, 0x066b); @@ -904,7 +916,8 @@ __set_lc_monetary_from_win (const char *name, _monetary_locale->mon_thousands_sep =3D charfromwchar (monetary, wmon_thousands_sep); /* mon_grouping */ - _monetary_locale->mon_grouping =3D conv_grouping (lcid, LOCALE_SMONG= ROUPING, + _monetary_locale->mon_grouping =3D conv_grouping (win_locale, + LOCALE_SMONGROUPING, &lc_monetary_ptr); /* positive_sign */ _monetary_locale->wpositive_sign =3D getlocaleinfo (monetary, @@ -915,33 +928,33 @@ __set_lc_monetary_from_win (const char *name, LOCALE_SNEGATIVESIGN); _monetary_locale->negative_sign =3D charfromwchar (monetary, wnegati= ve_sign); /* int_frac_digits */ - *lc_monetary_ptr =3D (char) getlocaleint (lcid, LOCALE_IINTLCURRDIGI= TS); + *lc_monetary_ptr =3D (char) getlocaleint (LOCALE_IINTLCURRDIGITS); _monetary_locale->int_frac_digits =3D lc_monetary_ptr++; /* frac_digits */ - *lc_monetary_ptr =3D (char) getlocaleint (lcid, LOCALE_ICURRDIGITS); + *lc_monetary_ptr =3D (char) getlocaleint (LOCALE_ICURRDIGITS); _monetary_locale->frac_digits =3D lc_monetary_ptr++; /* p_cs_precedes and int_p_cs_precedes */ - *lc_monetary_ptr =3D (char) getlocaleint (lcid, LOCALE_IPOSSYMPRECED= ES); + *lc_monetary_ptr =3D (char) 
getlocaleint (LOCALE_IPOSSYMPRECEDES); _monetary_locale->p_cs_precedes =3D _monetary_locale->int_p_cs_precedes =3D lc_monetary_ptr++; /* p_sep_by_space and int_p_sep_by_space */ - *lc_monetary_ptr =3D (char) getlocaleint (lcid, LOCALE_IPOSSEPBYSPAC= E); + *lc_monetary_ptr =3D (char) getlocaleint (LOCALE_IPOSSEPBYSPACE); _monetary_locale->p_sep_by_space =3D _monetary_locale->int_p_sep_by_space =3D lc_monetary_ptr++; /* n_cs_precedes and int_n_cs_precedes */ - *lc_monetary_ptr =3D (char) getlocaleint (lcid, LOCALE_INEGSYMPRECED= ES); + *lc_monetary_ptr =3D (char) getlocaleint (LOCALE_INEGSYMPRECEDES); _monetary_locale->n_cs_precedes =3D _monetary_locale->int_n_cs_precedes =3D lc_monetary_ptr++; /* n_sep_by_space and int_n_sep_by_space */ - *lc_monetary_ptr =3D (char) getlocaleint (lcid, LOCALE_INEGSEPBYSPAC= E); + *lc_monetary_ptr =3D (char) getlocaleint (LOCALE_INEGSEPBYSPACE); _monetary_locale->n_sep_by_space =3D _monetary_locale->int_n_sep_by_space =3D lc_monetary_ptr++; /* p_sign_posn and int_p_sign_posn */ - *lc_monetary_ptr =3D (char) getlocaleint (lcid, LOCALE_IPOSSIGNPOSN); + *lc_monetary_ptr =3D (char) getlocaleint (LOCALE_IPOSSIGNPOSN); _monetary_locale->p_sign_posn =3D _monetary_locale->int_p_sign_posn =3D lc_monetary_ptr++; /* n_sign_posn and int_n_sign_posn */ - *lc_monetary_ptr =3D (char) getlocaleint (lcid, LOCALE_INEGSIGNPOSN); + *lc_monetary_ptr =3D (char) getlocaleint (LOCALE_INEGSIGNPOSN); _monetary_locale->n_sign_posn =3D _monetary_locale->int_n_sign_posn =3D lc_monetary_ptr++; } @@ -970,10 +983,11 @@ __set_lc_messages_from_win (const char *name, char **lc_messages_buf, wctomb_p f_wctomb, const char *charset) { - LCID lcid =3D __get_lcid_from_locale (name); - if (lcid =3D=3D (LCID) -1) - return lcid; - if (!lcid && !strcmp (charset, "ASCII")) + wchar_t win_locale[ENCODING_LEN + 1]; + int ret =3D __get_rfc5646_from_locale (name, win_locale); + if (ret < 0) + return ret; + if (!ret && !strcmp (charset, "ASCII")) return 0; =20 char 
locale[ENCODING_LEN + 1]; @@ -981,7 +995,7 @@ __set_lc_messages_from_win (const char *name, lc_msg_t *msg =3D NULL; =20 /* C.foo is just a copy of "C" with fixed charset. */ - if (!lcid) + if (!ret) memcpy (_messages_locale, _C_messages_locale, sizeof (struct lc_messag= es_T)); else { @@ -1012,7 +1026,7 @@ __set_lc_messages_from_win (const char *name, target charset are simply ignored, as on Linux. */ size_t len =3D 0; len +=3D (strlen (charset) + 1); - if (lcid) + if (ret) { len +=3D lc_wcstombs (f_wctomb, NULL, msg->yesexpr, 0) + 1; len +=3D lc_wcstombs (f_wctomb, NULL, msg->noexpr, 0) + 1; @@ -1036,7 +1050,7 @@ __set_lc_messages_from_win (const char *name, /* codeset */ _messages_locale->codeset =3D c; c =3D stpcpy (c, charset) + 1; - if (lcid) + if (ret) { _messages_locale->yesexpr =3D (const char *) c; len =3D lc_wcstombs (f_wctomb, c, msg->yesexpr, lc_messages_end - c); @@ -1065,7 +1079,7 @@ __set_lc_messages_from_win (const char *name, =20 const struct lc_collate_T _C_collate_locale =3D { - 0, + L"", __ascii_mbtowc, "ASCII" }; @@ -1080,10 +1094,11 @@ __collate_load_locale (struct __locale_t *locale, c= onst char *name, char *bufp =3D NULL; struct lc_collate_T *cop =3D NULL; =20 - LCID lcid =3D __get_lcid_from_locale (name); - if (lcid =3D=3D (LCID) -1) - return -1; - if (lcid) + wchar_t win_locale[ENCODING_LEN + 1]; + int ret =3D __get_rfc5646_from_locale (name, win_locale); + if (ret < 0) + return ret; + if (ret) { bufp =3D (char *) malloc (1); /* dummy */ if (!bufp) @@ -1094,12 +1109,12 @@ __collate_load_locale (struct __locale_t *locale, c= onst char *name, free (bufp); return -1; } - cop->lcid =3D lcid; + wcscpy (cop->win_locale, win_locale); cop->mbtowc =3D (mbtowc_p) f_mbtowc; stpcpy (cop->codeset, charset); } struct __lc_cats tmp =3D locale->lc_cat[LC_COLLATE]; - locale->lc_cat[LC_COLLATE].ptr =3D lcid =3D=3D 0 ? &_C_collate_locale : = cop; + locale->lc_cat[LC_COLLATE].ptr =3D !win_locale[0] ? 
&_C_collate_locale := cop; locale->lc_cat[LC_COLLATE].buf =3D bufp; /* If buf is not NULL, both pointers have been alloc'ed */ if (tmp.buf) @@ -1119,11 +1134,11 @@ wcscoll_l (const wchar_t *__restrict ws1, const wch= ar_t *__restrict ws2, struct __locale_t *locale) { int ret; - LCID collate_lcid =3D __get_collate_locale (locale)->lcid; + const wchar_t *collate_locale =3D __get_collate_locale (locale)->win_loc= ale; =20 - if (!collate_lcid) + if (!collate_locale[0]) return wcscmp (ws1, ws2); - ret =3D CompareStringW (collate_lcid, 0, ws1, -1, ws2, -1); + ret =3D CompareStringEx (collate_locale, 0, ws1, -1, ws2, -1, NULL, NULL= , 0); if (!ret) set_errno (EINVAL); return ret - CSTR_EQUAL; @@ -1143,12 +1158,10 @@ strcoll_l (const char *__restrict s1, const char *_= _restrict s2, wchar_t *ws1, *ws2; tmp_pathbuf tp; int ret; - LCID collate_lcid =3D __get_collate_locale (locale)->lcid; + const wchar_t *collate_locale =3D __get_collate_locale (locale)->win_loc= ale; =20 - if (!collate_lcid) + if (!collate_locale[0]) return strcmp (s1, s2); - /* The ANSI version of CompareString uses the default charset of the lci= d, - so we must use the Unicode version. */ mbtowc_p collate_mbtowc =3D __get_collate_locale (locale)->mbtowc; n1 =3D lc_mbstowcs (collate_mbtowc, NULL, s1, 0) + 1; ws1 =3D (n1 > NT_MAX_PATH ? (wchar_t *) malloc (n1 * sizeof (wchar_t)) @@ -1158,7 +1171,7 @@ strcoll_l (const char *__restrict s1, const char *__r= estrict s2, ws2 =3D (n2 > NT_MAX_PATH ? 
(wchar_t *) malloc (n2 * sizeof (wchar_t)) : tp.w_get ()); lc_mbstowcs (collate_mbtowc, ws2, s2, n2); - ret =3D CompareStringW (collate_lcid, 0, ws1, -1, ws2, -1); + ret =3D CompareStringEx (collate_locale, 0, ws1, -1, ws2, -1, NULL, NULL= , 0); if (n1 > NT_MAX_PATH) free (ws1); if (n2 > NT_MAX_PATH) @@ -1350,15 +1363,15 @@ wcsxfrm_l (wchar_t *__restrict ws1, const wchar_t *= __restrict ws2, size_t wsn, struct __locale_t *locale) { size_t ret; - LCID collate_lcid =3D __get_collate_locale (locale)->lcid; + const wchar_t *collate_locale =3D __get_collate_locale (locale)->win_loc= ale; =20 - if (!collate_lcid) + if (!collate_locale[0]) return wcslcpy (ws1, ws2, wsn); /* Don't use LCMAP_SORTKEY in conjunction with LCMAP_BYTEREV. The cchDe= st parameter is used as byte count with LCMAP_SORTKEY but as char count = with LCMAP_BYTEREV. */ - ret =3D LCMapStringW (collate_lcid, LCMAP_SORTKEY, ws2, -1, ws1, - wsn * sizeof (wchar_t)); + ret =3D LCMapStringEx (collate_locale, LCMAP_SORTKEY, ws2, -1, ws1, + wsn * sizeof (wchar_t), NULL, NULL, 0); if (ret) { ret /=3D sizeof (wchar_t); @@ -1380,7 +1393,8 @@ wcsxfrm_l (wchar_t *__restrict ws1, const wchar_t *__= restrict ws2, size_t wsn, set_errno (EINVAL); else { - ret =3D LCMapStringW (collate_lcid, LCMAP_SORTKEY, ws2, -1, NULL, 0); + ret =3D LCMapStringEx (collate_locale, LCMAP_SORTKEY, ws2, -1, + NULL, 0, NULL, NULL, 0); if (ret) wsn =3D ret / sizeof (wchar_t); } @@ -1401,12 +1415,10 @@ strxfrm_l (char *__restrict s1, const char *__restr= ict s2, size_t sn, size_t n2; wchar_t *ws2; tmp_pathbuf tp; - LCID collate_lcid =3D __get_collate_locale (locale)->lcid; + const wchar_t *collate_locale =3D __get_collate_locale (locale)->win_loc= ale; =20 - if (!collate_lcid) + if (!collate_locale[0]) return strlcpy (s1, s2, sn); - /* The ANSI version of LCMapString uses the default charset of the lcid, - so we must use the Unicode version. 
*/ mbtowc_p collate_mbtowc =3D __get_collate_locale (locale)->mbtowc; n2 =3D lc_mbstowcs (collate_mbtowc, NULL, s2, 0) + 1; ws2 =3D (n2 > NT_MAX_PATH ? (wchar_t *) malloc (n2 * sizeof (wchar_t)) @@ -1415,8 +1427,8 @@ strxfrm_l (char *__restrict s1, const char *__restric= t s2, size_t sn, { lc_mbstowcs (collate_mbtowc, ws2, s2, n2); /* The sort key is a NUL-terminated byte string. */ - ret =3D LCMapStringW (collate_lcid, LCMAP_SORTKEY, ws2, -1, - (PWCHAR) s1, sn); + ret =3D LCMapStringEx (collate_locale, LCMAP_SORTKEY, ws2, -1, + (PWCHAR) s1, sn, NULL, NULL, 0); } if (ret =3D=3D 0) { @@ -1424,7 +1436,8 @@ strxfrm_l (char *__restrict s1, const char *__restric= t s2, size_t sn, if (!ws2 || GetLastError () !=3D ERROR_INSUFFICIENT_BUFFER) set_errno (EINVAL); else - ret =3D LCMapStringW (collate_lcid, LCMAP_SORTKEY, ws2, -1, NULL, 0); + ret =3D LCMapStringEx (collate_locale, LCMAP_SORTKEY, ws2, -1, + NULL, 0, NULL, NULL, 0); } if (ws2 && n2 > NT_MAX_PATH) free (ws2); @@ -1442,20 +1455,30 @@ strxfrm (char *__restrict s1, const char *__restric= t s2, size_t sn) /* Fetch default ANSI codepage from locale info and generate a setlocale compatible character set code. Called from newlib's setlocale(), if the charset isn't given explicitely in the POSIX compatible locale specifie= r. */ + +/* FIXME: Check all locales against their Linux counterpart again and + make sure the codeset conversion is correct. + FIXME: Perhaps, convert to locale names only. + FIXME: Perhaps, maintain a sorted list of Linux locales and their + default codesets. */ extern "C" void __set_charset_from_locale (const char *locale, char *charset) { UINT cp; - LCID lcid =3D __get_lcid_from_locale (locale); + wchar_t win_locale[ENCODING_LEN + 1]; + + int ret =3D __get_rfc5646_from_locale (locale, win_locale); wchar_t wbuf[9]; =20 /* "C" locale, or invalid locale? 
*/ - if (lcid =3D=3D 0 || lcid =3D=3D (LCID) -1) + if (ret <=3D 0) cp =3D 20127; - else if (!GetLocaleInfoW (lcid, + else if (!GetLocaleInfoEx (win_locale, LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER, (PWCHAR) &cp, sizeof cp)) cp =3D 0; + /* For simplicity, we still convert to LCID here. */ + LCID lcid =3D LocaleNameToLCID (win_locale, 0); /* Translate codepage and lcid to a charset closely aligned with the def= ault charsets defined in Glibc. */ const char *cs; @@ -1481,11 +1504,12 @@ __set_charset_from_locale (const char *locale, char= *charset) cs =3D "BIG5"; break; case 1250: - if (lcid =3D=3D 0x181a /* sr_BA (Serbian Language/Bosnia + if (lcid =3D=3D 0x181a /* sr_BA (Serbian/Bosnia and Herzegovina) */ - || lcid =3D=3D 0x241a /* sr_RS (Serbian Language/Serbia) */ - || lcid =3D=3D 0x2c1a /* sr_ME (Serbian Language/Montenegro)*/ - || lcid =3D=3D 0x0442) /* tk_TM (Turkmen/Turkmenistan) */ + || lcid =3D=3D 0x241a /* sr_RS (Serbian/Serbia) */ + || lcid =3D=3D 0x2c1a /* sr_ME (Serbian/Montenegro)*/ + || lcid =3D=3D 0x0442 /* tk_TM (Turkmen/Turkmenistan) */ + || !wcscmp (win_locale, L"sr-Latn-XK")) /* (Serbian/Kosovo) */ cs =3D "UTF-8"; else if (lcid =3D=3D 0x041c) /* sq_AL (Albanian/Albania) */ cs =3D "ISO-8859-1"; @@ -1498,17 +1522,21 @@ __set_charset_from_locale (const char *locale, char= *charset) || lcid =3D=3D 0x281a /* sr_RS (Serbian Language/Serbia) */ || lcid =3D=3D 0x301a /* sr_ME (Serbian Language/Montenegro)*/ || lcid =3D=3D 0x0440 /* ky_KG (Kyrgyz/Kyrgyzstan) */ + || lcid =3D=3D 0x082c /* az_AZ@cyrillic (Azerbaijani/Azerbaijan) */ || lcid =3D=3D 0x0843 /* uz_UZ (Uzbek/Uzbekistan) */ /* tt_RU (Tatar/Russia), IQTElif alphabet */ || (lcid =3D=3D 0x0444 && has_modifier ("@iqtelif")) - || lcid =3D=3D 0x0450) /* mn_MN (Mongolian/Mongolia) */ + || lcid =3D=3D 0x0450 /* mn_MN (Mongolian/Mongolia) */ + || !wcscmp (win_locale, L"sr-Cyrl-XK")) /* (Serbian/Kosovo) */ cs =3D "UTF-8"; else if (lcid =3D=3D 0x0423) /* be_BY (Belarusian/Belarus) */ cs =3D 
has_modifier ("@latin") ? "UTF-8" : "CP1251"; - else if (lcid =3D=3D 0x0402) /* bg_BG (Bulgarian/Bulgaria) */ + else if (lcid =3D=3D 0x0402 /* bg_BG (Bulgarian/Bulgaria) */ + || lcid =3D=3D 0x0423) /* be_BY (Belarusian/Belarus) */ cs =3D "CP1251"; - else if (lcid =3D=3D 0x0422) /* uk_UA (Ukrainian/Ukraine) */ + else if (lcid =3D=3D 0x0422 /* uk_UA (Ukrainian/Ukraine) */ + || !wcscmp (win_locale, L"ru-UA")) /* (Russian/Ukraine) */ cs =3D "KOI8-U"; else if (lcid =3D=3D 0x0428) /* tg_TJ (Tajik/Tajikistan) */ cs =3D "KOI8-T"; @@ -1532,7 +1560,8 @@ __set_charset_from_locale (const char *locale, char *= charset) || lcid =3D=3D 0x0832 /* tn_BW (Tswana/Botswana) */ || lcid =3D=3D 0x0432 /* tn_ZA (Tswana/South Africa) */ || lcid =3D=3D 0x0488 /* wo_SN (Wolof/Senegal) */ - || lcid =3D=3D 0x046a) /* yo_NG (Yoruba/Nigeria) */ + || lcid =3D=3D 0x046a /* yo_NG (Yoruba/Nigeria) */ + || lcid =3D=3D 0x085f) /* ber_DZ (Tamazight/Algeria) */ cs =3D "UTF-8"; else if (lcid =3D=3D 0x042e) /* hsb_DE (Upper Sorbian/Germany) */ cs =3D "ISO-8859-2"; diff --git a/winsup/cygwin/regex/regcomp.c b/winsup/cygwin/regex/regcomp.c index 5acfd4e52e61..9e29257cb3a8 100644 --- a/winsup/cygwin/regex/regcomp.c +++ b/winsup/cygwin/regex/regcomp.c @@ -827,7 +827,7 @@ p_b_term(struct parse *p, cset *cs) CHadd(p, cs, start); else { #ifdef __CYGWIN__ - if (!__get_current_collate_locale ()->lcid) { + if (!__get_current_collate_locale ()->win_locale[0]) { #else if (__collate_load_error) { #endif