public inbox for newlib-cvs@sourceware.org
help / color / mirror / Atom feed
* [newlib-cygwin/main] Cygwin: convert Windows locale handling from LCID to ISO5646 strings
@ 2023-02-24 15:42 Corinna Vinschen
  0 siblings, 0 replies; only message in thread
From: Corinna Vinschen @ 2023-02-24 15:42 UTC (permalink / raw)
  To: cygwin-cvs, newlib-cvs

https://sourceware.org/git/gitweb.cgi?p=newlib-cygwin.git;h=e95a7a795522165a2d022dbb4df281d6be661c83

commit e95a7a795522165a2d022dbb4df281d6be661c83
Author:     Corinna Vinschen <corinna@vinschen.de>
AuthorDate: Fri Feb 24 16:37:44 2023 +0100
Commit:     Corinna Vinschen <corinna@vinschen.de>
CommitDate: Fri Feb 24 16:40:58 2023 +0100

    Cygwin: convert Windows locale handling from LCID to ISO5646 strings
    
    Since Windows Vista, locale handling is converted from using numeric
    locale identifiers (LCID) to using ISO5646 locale strings.  In the
    meantime Windows introduced new locales which don't even have a LCID
    attached.  Those were unusable in Cygwin because locale information
    for these locales required to call the new locale functions taking
    a locale string.
    
    Convert Cygwin to drop LCIDs and use Windows ISO5646 locales instead.
    
    The last place using LCIDs is the __set_charset_from_locale function.
    Checking numerically is easier and uslay faster than checking strings.
    However, this function is clearly a TODO

Diff:
---
 newlib/libc/locale/setlocale.h |   2 +-
 winsup/cygwin/glob.cc          |   2 +-
 winsup/cygwin/libc/fnmatch.c   |   2 +-
 winsup/cygwin/nlsfuncs.cc      | 449 ++++++++++++++++++++++-------------------
 winsup/cygwin/regex/regcomp.c  |   2 +-
 5 files changed, 243 insertions(+), 214 deletions(-)

diff --git a/newlib/libc/locale/setlocale.h b/newlib/libc/locale/setlocale.h
index 3530ec664dc2..7abf92f7ec2a 100644
--- a/newlib/libc/locale/setlocale.h
+++ b/newlib/libc/locale/setlocale.h
@@ -46,7 +46,7 @@ __BEGIN_DECLS
 #ifdef __CYGWIN__
 struct lc_collate_T
 {
-  __uint32_t	 lcid;
+  wchar_t	 win_locale[ENCODING_LEN + 1];
   int	       (*mbtowc) (struct _reent *, wchar_t *, const char *, size_t,
 			  mbstate_t *);
   char		 codeset[ENCODING_LEN + 1];
diff --git a/winsup/cygwin/glob.cc b/winsup/cygwin/glob.cc
index 64c3507d891e..63a07a930f06 100644
--- a/winsup/cygwin/glob.cc
+++ b/winsup/cygwin/glob.cc
@@ -929,7 +929,7 @@ match(Char *name, Char *pat, Char *patend)
 					if (M_COLL_P(pat[1]))
 						len2 = M_COLL_CNT(*++pat);
 #ifdef __CYGWIN__
-					if ((!__get_current_collate_locale ()->lcid) ?
+					if ((!__get_current_collate_locale ()->win_locale[0]) ?
 #else
 					if (__collate_load_error ?
 #endif
diff --git a/winsup/cygwin/libc/fnmatch.c b/winsup/cygwin/libc/fnmatch.c
index dfbe3863fbfa..beb67f01e5d1 100644
--- a/winsup/cygwin/libc/fnmatch.c
+++ b/winsup/cygwin/libc/fnmatch.c
@@ -330,7 +330,7 @@ rangematch(const char *pattern, wint_t test, int flags, char **newp,
 				c2 = towlower(c2);
 
 #ifdef __CYGWIN__
-			if ((!__get_current_collate_locale ()->lcid) ?
+			if ((!__get_current_collate_locale ()->win_locale[0]) ?
 #else
 			if (table->__collate_load_error ?
 #endif
diff --git a/winsup/cygwin/nlsfuncs.cc b/winsup/cygwin/nlsfuncs.cc
index 0d2764b05847..60bababd2668 100644
--- a/winsup/cygwin/nlsfuncs.cc
+++ b/winsup/cygwin/nlsfuncs.cc
@@ -25,128 +25,129 @@ details. */
 #define _LC(x)	&lc_##x##_ptr,lc_##x##_end-lc_##x##_ptr
 
 #define getlocaleinfo(category,type) \
-	    __getlocaleinfo(lcid,(type),_LC(category))
+	    __getlocaleinfo(win_locale,(type),_LC(category))
+#define getlocaleint(type) \
+	    __getlocaleint(win_locale,(type))
 #define setlocaleinfo(category,val) \
 	    __setlocaleinfo(_LC(category),(val))
 #define eval_datetimefmt(type,flags) \
-	    __eval_datetimefmt(lcid,(type),(flags),&lc_time_ptr,\
+	    __eval_datetimefmt(win_locale,(type),(flags),&lc_time_ptr,\
 			       lc_time_end-lc_time_ptr)
 #define charfromwchar(category,in) \
 	    __charfromwchar (_##category##_locale->in,_LC(category),f_wctomb)
 
 #define has_modifier(x)	((x)[0] && !strcmp (modifier, (x)))
 
-static char last_locale[ENCODING_LEN + 1];
-static LCID last_lcid;
-
-/* Fetch LCID from POSIX locale specifier.
+/* Fetch Windows RFC 5646 locale from POSIX locale specifier.
    Return values:
 
      -1: Invalid locale
       0: C or POSIX
-     >0: LCID
+      1: valid locale
 */
-static LCID
-__get_lcid_from_locale (const char *name)
+static int
+__get_rfc5646_from_locale (const char *name, wchar_t *win_locale)
 {
-  char locale[ENCODING_LEN + 1];
-  char *c;
-  LCID lcid;
-
-  /* Speed up reusing the same locale as before, for instance in LC_ALL case. */
-  if (!strcmp (name, last_locale))
-    {
-      debug_printf ("LCID=%04y", last_lcid);
-      return last_lcid;
-    }
-  stpcpy (last_locale, name);
-  stpcpy (locale, name);
-  /* Store modifier for later use. */
-  const char *modifier = strchr (last_locale, '@') ? : "";
+  wchar_t wlocale[ENCODING_LEN + 1] = { 0 };
+  wchar_t locale[ENCODING_LEN + 1];
+  wchar_t *c;
+
+  win_locale[0] = L'\0';
+  mbstowcs (locale, name, ENCODING_LEN + 1);
+  /* Remember modifier for later use. */
+  const char *modifier = strchr (name, '@') ? : "";
   /* Drop charset and modifier */
-  c = strchr (locale, '.');
+  c = wcschr (locale, L'.');
   if (!c)
-    c = strchr (locale, '@');
+    c = wcschr (locale, L'@');
   if (c)
-    *c = '\0';
+    *c = L'\0';
   /* "POSIX" already converted to "C" in loadlocale. */
-  if (!strcmp (locale, "C"))
-    return last_lcid = 0;
-  c = strchr (locale, '_');
+  if (!wcscmp (locale, L"C"))
+    return 0;
+  c = wcschr (locale, '_');
   if (!c)
-    return last_lcid = (LCID) -1;
-
-  wchar_t wlocale[ENCODING_LEN + 1];
+    {
+      /* try if the locale can be resolved from the language tag
+         fix up Linux-only locale first */
+      if (!wcscmp (locale, L"ber"))
+	wcscpy (locale, L"tzm");
+      if (ResolveLocaleName (locale, wlocale, ENCODING_LEN + 1) <= 0)
+	return -1;
+      wcpcpy (win_locale, wlocale);
+      return 1;
+    }
 
-  /* Convert to RFC 4646 syntax. */
+  /* Convert to RFC 5646 syntax. */
   *c = '-';
-  mbstowcs (wlocale, locale, ENCODING_LEN + 1);
-  lcid = LocaleNameToLCID (wlocale, 0);
-  /* Bug on Windows 10: LocaleNameToLCID returns LOCALE_CUSTOM_UNSPECIFIED
-     for unknown locales. */
-  if (lcid == 0 || lcid == LOCALE_CUSTOM_UNSPECIFIED)
+  /* Override a few locales with a different default script as used
+     on Linux.  Linux also supports no_NO which is equivalent to nb_NO,
+     but Windows can resolve that nicely.  Also, "tzm" and "zgh" are
+     subsumed under "ber" on Linux. */
+  struct {
+    const wchar_t *loc;
+    const wchar_t *wloc;
+  } override_locale[] = {
+    { L"ber-DZ" , L"tzm-Latn-DZ" },
+    { L"ber-MA" , L"zgh-Tfng-MA" },
+    { L"mn-CN" , L"mn-Mong-CN"   },
+    { L"mn-MN" , L"mn-Mong-MN"   },
+    { L"pa-PK" , L"pa-Arab-PK"   },
+    { L"sd-IN" , L"sd-Deva-IN"   },
+    { L"sr-BA" , L"sr-Cyrl-BA"   },
+    { L"sr-ME" , L"sr-Cyrl-ME"   },
+    { L"sr-RS" , L"sr-Cyrl-RS"   },
+    { L"sr-XK" , L"sr-Cyrl-XK"   },
+    { L"tzm-MA", L"tzm-Tfng-MA"  },
+    { NULL    , NULL	     }
+  };
+
+  for (int i = 0; override_locale[i].loc
+		  && override_locale[i].loc[0] <= locale[0]; ++i)
     {
-      /* Unfortunately there are a couple of locales for which no form
-	 without a Script part per RFC 4646 exists.
-	 Linux also supports no_NO which is equivalent to nb_NO. */
-      struct {
-	const char    *loc;
-	const wchar_t *wloc;
-      } sc_only_locale[] = {
-	{ "az-AZ" , L"az-Latn-AZ"  },
-	{ "bs-BA" , L"bs-Latn-BA"  },
-	{ "chr-US", L"chr-Cher-US"},
-	{ "ff-SN" , L"ff-Latn-SN"  },
-	{ "ha-NG" , L"ha-Latn-NG"  },
-	{ "iu-CA" , L"iu-Latn-CA"  },
-	{ "ks-IN" , L"ks-Arab-IN"  },
-	{ "ku-IQ" , L"ku-Arab-IQ"  },
-	{ "mn-CN" , L"mn-Mong-CN"  },
-	{ "mn-MN" , L"mn-Mong-MN"  },
-	{ "no-NO" , L"nb-NO"       },
-	{ "pa-PK" , L"pa-Arab-PK"  },
-	{ "quc-GT", L"quc-Latn-GT" },
-	{ "sd-PK" , L"sd-Arab-PK"  },
-	{ "sd-IN" , L"sd-Deva-IN"  },
-	{ "sr-BA" , L"sr-Cyrl-BA"  },
-	{ "sr-ME" , L"sr-Cyrl-ME"  },
-	{ "sr-RS" , L"sr-Cyrl-RS"  },
-	{ "tg-TJ" , L"tg-Cyrl-TJ"  },
-	{ "tzm-DZ", L"tzm-Latn-DZ" },
-	{ "tzm-MA", L"tzm-Tfng-MA" },
-	{ "uz-UZ" , L"uz-Latn-UZ"  },
-	{ NULL    , NULL	   }
-      };
-      for (int i = 0; sc_only_locale[i].loc
-		      && sc_only_locale[i].loc[0] <= locale[0]; ++i)
-	if (!strcmp (locale, sc_only_locale[i].loc))
-	  {
-	    lcid = LocaleNameToLCID (sc_only_locale[i].wloc, 0);
-	    if (!strncmp (locale, "sr-", 3))
-	      {
-		/* "@latin" modifier for the sr_XY locales changes
-		    collation behaviour so lcid should accommodate that
-		    by being set to the Latin sublang. */
-		if (lcid != 0 && lcid != LOCALE_CUSTOM_UNSPECIFIED
-		    && has_modifier ("@latin"))
-		  lcid = MAKELANGID (lcid & 0x3ff, (lcid >> 10) - 1);
-	      }
-	    else if (!strncmp (locale, "uz-", 3))
-	      {
-		/* Equivalent for "@cyrillic" modifier in uz_UZ locale */
-		if (lcid != 0 && lcid != LOCALE_CUSTOM_UNSPECIFIED
-		    && has_modifier ("@cyrillic"))
-		  lcid = MAKELANGID (lcid & 0x3ff, (lcid >> 10) + 1);
-	      }
-	    break;
-	  }
+      if (!wcscmp (locale, override_locale[i].loc))
+	{
+	  wcscpy (wlocale, override_locale[i].wloc);
+	  break;
+	}
+    }
+  if (!wlocale[0]
+      && ResolveLocaleName (locale, wlocale, ENCODING_LEN + 1) <= 1)
+    return -1;
+
+  /* Check for modifiers changing the script */
+  const wchar_t *iso15924_script[] = { L"Latn-", L"Cyrl-", L"Deva-", L"Adlm-" };
+  int idx = -1;
+
+  if (modifier[0])
+    {
+      if (!strcmp (++modifier, "latin"))
+	idx = 0;
+      else if (!strcmp (modifier, "cyrillic"))
+	idx = 1;
+      else if (!strcmp (modifier, "devanagari"))
+	idx = 2;
+      else if (!strcmp (modifier, "adlam"))
+	idx = 3;
+    }
+  if (idx >= 0)
+    {
+      wchar_t *iso3166 = wcschr (wlocale, L'-') + 1;
+      wchar_t *wlp;
+
+      /* Copy iso639 language part including dash */
+      wlp = wcpncpy (win_locale, wlocale, iso3166 - wlocale);
+      /* Concat new iso15924 script */
+      wlp = wcpcpy (wlp, iso15924_script[idx]);
+      /* Concat iso3166 territory.  Skip script, if already in the locale */
+      wchar_t *skip_script = wcschr (iso3166, L'-');
+      if (skip_script)
+	iso3166 = skip_script + 1;
+       wcpcpy (wlp, iso3166);
     }
-  if (lcid && lcid != LOCALE_CUSTOM_UNSPECIFIED)
-    last_lcid = lcid;
   else
-    last_lcid = (LCID) -1;
-  debug_printf ("LCID=%04y", last_lcid);
-  return last_lcid;
+    wcpcpy (win_locale, wlocale);
+  return 1;
 }
 
 /* Never returns -1.  Just skips invalid chars instead.  Only if return_invalid
@@ -257,7 +258,7 @@ rebase_locale_buf (const void *ptrv, const void *ptrvend, const char *newbase,
 }
 
 static wchar_t *
-__getlocaleinfo (LCID lcid, LCTYPE type, char **ptr, size_t size)
+__getlocaleinfo (wchar_t *loc, LCTYPE type, char **ptr, size_t size)
 {
   size_t num;
   wchar_t *ret;
@@ -265,7 +266,7 @@ __getlocaleinfo (LCID lcid, LCTYPE type, char **ptr, size_t size)
   if ((uintptr_t) *ptr % 1)
     ++*ptr;
   ret = (wchar_t *) *ptr;
-  num = GetLocaleInfoW (lcid, type, ret, size / sizeof (wchar_t));
+  num = GetLocaleInfoEx (loc, type, ret, size / sizeof (wchar_t));
   *ptr = (char *) (ret + num);
   return ret;
 }
@@ -296,10 +297,10 @@ __charfromwchar (const wchar_t *in, char **ptr, size_t size, wctomb_p f_wctomb)
 }
 
 static UINT
-getlocaleint (LCID lcid, LCTYPE type)
+__getlocaleint (wchar_t *loc, LCTYPE type)
 {
   UINT val;
-  return GetLocaleInfoW (lcid, type | LOCALE_RETURN_NUMBER, (PWCHAR) &val,
+  return GetLocaleInfoEx (loc, type | LOCALE_RETURN_NUMBER, (PWCHAR) &val,
 			 sizeof val) ? val : 0;
 }
 
@@ -310,7 +311,7 @@ enum dt_flags {
 };
 
 static wchar_t *
-__eval_datetimefmt (LCID lcid, LCTYPE type, dt_flags flags, char **ptr,
+__eval_datetimefmt (wchar_t *loc, LCTYPE type, dt_flags flags, char **ptr,
 		    size_t size)
 {
   wchar_t buf[80];
@@ -327,7 +328,7 @@ __eval_datetimefmt (LCID lcid, LCTYPE type, dt_flags flags, char **ptr,
     ++*ptr;
   wchar_t *ret = (wchar_t *) *ptr;
   wchar_t *p = (wchar_t *) *ptr;
-  GetLocaleInfoW (lcid, type, buf, 80);
+  GetLocaleInfoEx (loc, type, buf, 80);
   for (wchar_t *fmt = buf; *fmt; ++fmt)
     switch (fc = *fmt)
       {
@@ -390,20 +391,21 @@ __eval_datetimefmt (LCID lcid, LCTYPE type, dt_flags flags, char **ptr,
 
 /* Convert Windows grouping format into POSIX grouping format. */
 static char *
-conv_grouping (LCID lcid, LCTYPE type, char **lc_ptr)
+conv_grouping (wchar_t *loc, LCTYPE type, char **lc_ptr)
 {
-  char buf[10]; /* Per MSDN max size of LOCALE_SGROUPING element incl. NUL */
+  wchar_t buf[10]; /* Per MSDN max size of LOCALE_SGROUPING element incl. NUL */
   bool repeat = false;
   char *ptr = *lc_ptr;
   char *ret = ptr;
 
-  GetLocaleInfoA (lcid, type, buf, 10);
-  /* Convert Windows grouping format into POSIX grouping format. */
-  for (char *c = buf; *c; ++c)
+  GetLocaleInfoEx (loc, type, buf, 10);
+  /* Convert Windows grouping format into POSIX grouping format. Note that
+     only ASCII chars are used in the grouping format. */
+  for (wchar_t *c = buf; *c; ++c)
     {
-      if (*c < '0' || *c > '9')
+      if (*c < L'0' || *c > L'9')
 	continue;
-      char val = *c - '0';
+      char val = *c - L'0';
       if (!val)
 	{
 	  repeat = true;
@@ -429,10 +431,11 @@ __set_lc_time_from_win (const char *name,
 			char **lc_time_buf, wctomb_p f_wctomb,
 			const char *charset)
 {
-  LCID lcid = __get_lcid_from_locale (name);
-  if (lcid == (LCID) -1)
-    return lcid;
-  if (!lcid && !strcmp (charset, "ASCII"))
+  wchar_t win_locale[ENCODING_LEN + 1];
+  int ret = __get_rfc5646_from_locale (name, win_locale);
+  if (ret < 0)
+    return ret;
+  if (!ret && !strcmp (charset, "ASCII"))
     return 0;
 
 # define MAX_TIME_BUFFER_SIZE	4096
@@ -445,17 +448,17 @@ __set_lc_time_from_win (const char *name,
   char *lc_time_ptr = new_lc_time_buf;
 
   /* C.foo is just a copy of "C" with fixed charset. */
-  if (!lcid)
+  if (!ret)
     memcpy (_time_locale, _C_time_locale, sizeof (struct lc_time_T));
   /* codeset */
   _time_locale->codeset = lc_time_ptr;
   lc_time_ptr = stpcpy (lc_time_ptr, charset) + 1;
 
-  if (lcid)
+  if (ret)
     {
       char locale[ENCODING_LEN + 1];
       strcpy (locale, name);
-      /* Removes the charset from the locale and attach the modifer to the
+      /* Removes the charset from the locale and attach the modifier to the
 	 language_TERRITORY part. */
       char *c = strchr (locale, '.');
       if (c)
@@ -476,17 +479,22 @@ __set_lc_time_from_win (const char *name,
 					    sizeof *lc_era, locale_cmp);
 
       /* mon */
-      /* Windows has a bug in Japanese and Korean locales.  In these
-	 locales, strings returned for LOCALE_SABBREVMONTHNAME* are missing
-	 the suffix representing a month.  Unfortunately this is not
-	 documented in English.  A Japanese article describing the problem
-	 is http://msdn.microsoft.com/ja-jp/library/cc422084.aspx
+      /* Windows has a bug in "ja-JP" and "ko-KR" (but not in "ko-KP").
+         In these locales, strings returned for LOCALE_SABBREVMONTHNAME*
+	 are missing the suffix representing a month.
+
+	 A Japanese article describing the problem was
+	 https://msdn.microsoft.com/ja-jp/library/cc422084.aspx, which is
+	 only available via
+	 https://web.archive.org/web/20110922195821/https://msdn.microsoft.com/ja-jp/library/cc422084.aspx
+	 these days.  Testing indicates that this problem is still present
+	 in Windows 11.
+
 	 The workaround is to use LOCALE_SMONTHNAME* in these locales,
 	 even for the abbreviated month name. */
-      const LCTYPE mon_base =
-		lcid == MAKELANGID (LANG_JAPANESE, SUBLANG_JAPANESE_JAPAN)
-		|| lcid == MAKELANGID (LANG_KOREAN, SUBLANG_KOREAN)
-		? LOCALE_SMONTHNAME1 : LOCALE_SABBREVMONTHNAME1;
+      const LCTYPE mon_base = !wcscmp (win_locale, L"ja-JP")
+			      || !wcscmp (win_locale, L"ko-KR")
+			      ? LOCALE_SMONTHNAME1 : LOCALE_SABBREVMONTHNAME1;
       for (int i = 0; i < 12; ++i)
 	{
 	  _time_locale->wmon[i] = getlocaleinfo (time, mon_base + i);
@@ -495,7 +503,8 @@ __set_lc_time_from_win (const char *name,
       /* month and alt_month */
       for (int i = 0; i < 12; ++i)
 	{
-	  _time_locale->wmonth[i] = getlocaleinfo (time, LOCALE_SMONTHNAME1 + i);
+	  _time_locale->wmonth[i] = getlocaleinfo (time,
+						   LOCALE_SMONTHNAME1 + i);
 	  _time_locale->month[i] = _time_locale->alt_month[i]
 				 = charfromwchar (time, wmonth[i]);
 	}
@@ -570,7 +579,7 @@ __set_lc_time_from_win (const char *name,
       /* md */
       {
 	wchar_t buf[80];
-	GetLocaleInfoW (lcid, LOCALE_IDATE, buf, 80);
+	GetLocaleInfoEx (win_locale, LOCALE_IDATE, buf, 80);
 	_time_locale->md_order = (const char *) lc_time_ptr;
 	lc_time_ptr = stpcpy (lc_time_ptr, *buf == L'1' ? "dm" : "md") + 1;
       }
@@ -690,10 +699,11 @@ __set_lc_ctype_from_win (const char *name,
 			 char **lc_ctype_buf, wctomb_p f_wctomb,
 			 const char *charset, int mb_cur_max)
 {
-  LCID lcid = __get_lcid_from_locale (name);
-  if (lcid == (LCID) -1)
-    return lcid;
-  if (!lcid && !strcmp (charset, "ASCII"))
+  wchar_t win_locale[ENCODING_LEN + 1];
+  int ret = __get_rfc5646_from_locale (name, win_locale);
+  if (ret < 0)
+    return ret;
+  if (!ret && !strcmp (charset, "ASCII"))
     return 0;
 
 # define MAX_CTYPE_BUFFER_SIZE	256
@@ -704,7 +714,7 @@ __set_lc_ctype_from_win (const char *name,
     return -1;
   char *lc_ctype_ptr = new_lc_ctype_buf;
   /* C.foo is just a copy of "C" with fixed charset. */
-  if (!lcid)
+  if (!ret)
     memcpy (_ctype_locale, _C_ctype_locale, sizeof (struct lc_ctype_T));
   /* codeset */
   _ctype_locale->codeset = lc_ctype_ptr;
@@ -713,11 +723,11 @@ __set_lc_ctype_from_win (const char *name,
   _ctype_locale->mb_cur_max = lc_ctype_ptr;
   *lc_ctype_ptr++ = mb_cur_max;
   *lc_ctype_ptr++ = '\0';
-  if (lcid)
+  if (ret)
     {
       /* outdigits and woutdigits */
       wchar_t digits[11];
-      GetLocaleInfoW (lcid, LOCALE_SNATIVEDIGITS, digits, 11);
+      GetLocaleInfoEx (win_locale, LOCALE_SNATIVEDIGITS, digits, 11);
       for (int i = 0; i <= 9; ++i)
 	{
 	  mbstate_t state;
@@ -762,10 +772,11 @@ __set_lc_numeric_from_win (const char *name,
 			   char **lc_numeric_buf, wctomb_p f_wctomb,
 			   const char *charset)
 {
-  LCID lcid = __get_lcid_from_locale (name);
-  if (lcid == (LCID) -1)
-    return lcid;
-  if (!lcid && !strcmp (charset, "ASCII"))
+  wchar_t win_locale[ENCODING_LEN + 1];
+  int ret = __get_rfc5646_from_locale (name, win_locale);
+  if (ret < 0)
+    return ret;
+  if (!ret && !strcmp (charset, "ASCII"))
     return 0;
 
 # define MAX_NUMERIC_BUFFER_SIZE	256
@@ -777,20 +788,20 @@ __set_lc_numeric_from_win (const char *name,
     return -1;
   char *lc_numeric_ptr = new_lc_numeric_buf;
   /* C.foo is just a copy of "C" with fixed charset. */
-  if (!lcid)
+  if (!ret)
     memcpy (_numeric_locale, _C_numeric_locale, sizeof (struct lc_numeric_T));
   else
     {
       /* decimal_point and thousands_sep */
-      if (lcid == 0x0429)	/* fa_IR.  Windows decimal_point is slash,
-					   correct is dot */
+      /* fa_IR.  Windows decimal_point is slash, correct is dot */
+      if (!wcscmp (win_locale, L"fa-IR"))
 	{
 	  _numeric_locale->wdecimal_point = setlocaleinfo (numeric, L'.');
 	  _numeric_locale->wthousands_sep = setlocaleinfo (numeric, L',');
 	}
-      else if (lcid == 0x0463)	/* ps_AF.  Windows decimal_point is dot,
-					   thousands_sep is comma, correct are
-					   arabic separators. */
+      /* ps_AF.  Windows decimal_point is dot, thousands_sep is comma,
+		 correct are arabic separators. */
+      else if (!wcscmp (win_locale, L"ps-AF"))
 	{
 	  _numeric_locale->wdecimal_point = setlocaleinfo (numeric, 0x066b);
 	  _numeric_locale->wthousands_sep = setlocaleinfo (numeric, 0x066c);
@@ -805,7 +816,7 @@ __set_lc_numeric_from_win (const char *name,
       _numeric_locale->decimal_point = charfromwchar (numeric, wdecimal_point);
       _numeric_locale->thousands_sep = charfromwchar (numeric, wthousands_sep);
       /* grouping */
-      _numeric_locale->grouping = conv_grouping (lcid, LOCALE_SGROUPING,
+      _numeric_locale->grouping = conv_grouping (win_locale, LOCALE_SGROUPING,
 						 &lc_numeric_ptr);
     }
   /* codeset */
@@ -837,10 +848,11 @@ __set_lc_monetary_from_win (const char *name,
 			    char **lc_monetary_buf, wctomb_p f_wctomb,
 			    const char *charset)
 {
-  LCID lcid = __get_lcid_from_locale (name);
-  if (lcid == (LCID) -1)
-    return lcid;
-  if (!lcid && !strcmp (charset, "ASCII"))
+  wchar_t win_locale[ENCODING_LEN + 1];
+  int ret = __get_rfc5646_from_locale (name, win_locale);
+  if (ret < 0)
+    return ret;
+  if (!ret && !strcmp (charset, "ASCII"))
     return 0;
 
 # define MAX_MONETARY_BUFFER_SIZE	512
@@ -852,7 +864,7 @@ __set_lc_monetary_from_win (const char *name,
     return -1;
   char *lc_monetary_ptr = new_lc_monetary_buf;
   /* C.foo is just a copy of "C" with fixed charset. */
-  if (!lcid)
+  if (!ret)
     memcpy (_monetary_locale, _C_monetary_locale, sizeof (struct lc_monetary_T));
   else
     {
@@ -881,11 +893,11 @@ __set_lc_monetary_from_win (const char *name,
 	_monetary_locale->currency_symbol = charfromwchar (monetary,
 							   wcurrency_symbol);
       /* mon_decimal_point and mon_thousands_sep */
-      if (lcid == 0x0429 || lcid == 0x0463)	/* fa_IR or ps_AF.  Windows
-						   mon_decimal_point is slash
-						   and comma, mon_thousands_sep
-						   is comma and dot, correct
-						   are arabic separators. */
+      /* fa_IR or ps_AF.  Windows mon_decimal_point is slash and comma,
+			  mon_thousands_sep is comma and dot, correct
+			  are arabic separators. */
+      if (!wcscmp (win_locale, L"fa-IR")
+	  || !wcscmp (win_locale, L"ps-AF"))
 	{
 	  _monetary_locale->wmon_decimal_point = setlocaleinfo (monetary,
 								0x066b);
@@ -904,7 +916,8 @@ __set_lc_monetary_from_win (const char *name,
       _monetary_locale->mon_thousands_sep = charfromwchar (monetary,
 							   wmon_thousands_sep);
       /* mon_grouping */
-      _monetary_locale->mon_grouping = conv_grouping (lcid, LOCALE_SMONGROUPING,
+      _monetary_locale->mon_grouping = conv_grouping (win_locale,
+						      LOCALE_SMONGROUPING,
 						      &lc_monetary_ptr);
       /* positive_sign */
       _monetary_locale->wpositive_sign = getlocaleinfo (monetary,
@@ -915,33 +928,33 @@ __set_lc_monetary_from_win (const char *name,
 							LOCALE_SNEGATIVESIGN);
       _monetary_locale->negative_sign = charfromwchar (monetary, wnegative_sign);
       /* int_frac_digits */
-      *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_IINTLCURRDIGITS);
+      *lc_monetary_ptr = (char) getlocaleint (LOCALE_IINTLCURRDIGITS);
       _monetary_locale->int_frac_digits = lc_monetary_ptr++;
       /* frac_digits */
-      *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_ICURRDIGITS);
+      *lc_monetary_ptr = (char) getlocaleint (LOCALE_ICURRDIGITS);
       _monetary_locale->frac_digits = lc_monetary_ptr++;
       /* p_cs_precedes and int_p_cs_precedes */
-      *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_IPOSSYMPRECEDES);
+      *lc_monetary_ptr = (char) getlocaleint (LOCALE_IPOSSYMPRECEDES);
       _monetary_locale->p_cs_precedes
 	    = _monetary_locale->int_p_cs_precedes = lc_monetary_ptr++;
       /* p_sep_by_space and int_p_sep_by_space */
-      *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_IPOSSEPBYSPACE);
+      *lc_monetary_ptr = (char) getlocaleint (LOCALE_IPOSSEPBYSPACE);
       _monetary_locale->p_sep_by_space
 	    = _monetary_locale->int_p_sep_by_space = lc_monetary_ptr++;
       /* n_cs_precedes and int_n_cs_precedes */
-      *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_INEGSYMPRECEDES);
+      *lc_monetary_ptr = (char) getlocaleint (LOCALE_INEGSYMPRECEDES);
       _monetary_locale->n_cs_precedes
 	    = _monetary_locale->int_n_cs_precedes = lc_monetary_ptr++;
       /* n_sep_by_space and int_n_sep_by_space */
-      *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_INEGSEPBYSPACE);
+      *lc_monetary_ptr = (char) getlocaleint (LOCALE_INEGSEPBYSPACE);
       _monetary_locale->n_sep_by_space
 	    = _monetary_locale->int_n_sep_by_space = lc_monetary_ptr++;
       /* p_sign_posn and int_p_sign_posn */
-      *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_IPOSSIGNPOSN);
+      *lc_monetary_ptr = (char) getlocaleint (LOCALE_IPOSSIGNPOSN);
       _monetary_locale->p_sign_posn
 	    = _monetary_locale->int_p_sign_posn = lc_monetary_ptr++;
       /* n_sign_posn and int_n_sign_posn */
-      *lc_monetary_ptr = (char) getlocaleint (lcid, LOCALE_INEGSIGNPOSN);
+      *lc_monetary_ptr = (char) getlocaleint (LOCALE_INEGSIGNPOSN);
       _monetary_locale->n_sign_posn
 	    = _monetary_locale->int_n_sign_posn = lc_monetary_ptr++;
     }
@@ -970,10 +983,11 @@ __set_lc_messages_from_win (const char *name,
 			    char **lc_messages_buf,
 			    wctomb_p f_wctomb, const char *charset)
 {
-  LCID lcid = __get_lcid_from_locale (name);
-  if (lcid == (LCID) -1)
-    return lcid;
-  if (!lcid && !strcmp (charset, "ASCII"))
+  wchar_t win_locale[ENCODING_LEN + 1];
+  int ret = __get_rfc5646_from_locale (name, win_locale);
+  if (ret < 0)
+    return ret;
+  if (!ret && !strcmp (charset, "ASCII"))
     return 0;
 
   char locale[ENCODING_LEN + 1];
@@ -981,7 +995,7 @@ __set_lc_messages_from_win (const char *name,
   lc_msg_t *msg = NULL;
 
   /* C.foo is just a copy of "C" with fixed charset. */
-  if (!lcid)
+  if (!ret)
     memcpy (_messages_locale, _C_messages_locale, sizeof (struct lc_messages_T));
   else
     {
@@ -1012,7 +1026,7 @@ __set_lc_messages_from_win (const char *name,
      target charset are simply ignored, as on Linux. */
   size_t len = 0;
   len += (strlen (charset) + 1);
-  if (lcid)
+  if (ret)
     {
       len += lc_wcstombs (f_wctomb, NULL, msg->yesexpr, 0) + 1;
       len += lc_wcstombs (f_wctomb, NULL, msg->noexpr, 0) + 1;
@@ -1036,7 +1050,7 @@ __set_lc_messages_from_win (const char *name,
   /* codeset */
   _messages_locale->codeset = c;
   c = stpcpy (c, charset) + 1;
-  if (lcid)
+  if (ret)
     {
       _messages_locale->yesexpr = (const char *) c;
       len = lc_wcstombs (f_wctomb, c, msg->yesexpr, lc_messages_end - c);
@@ -1065,7 +1079,7 @@ __set_lc_messages_from_win (const char *name,
 
 const struct lc_collate_T _C_collate_locale =
 {
-  0,
+  L"",
   __ascii_mbtowc,
   "ASCII"
 };
@@ -1080,10 +1094,11 @@ __collate_load_locale (struct __locale_t *locale, const char *name,
   char *bufp = NULL;
   struct lc_collate_T *cop = NULL;
 
-  LCID lcid = __get_lcid_from_locale (name);
-  if (lcid == (LCID) -1)
-    return -1;
-  if (lcid)
+  wchar_t win_locale[ENCODING_LEN + 1];
+  int ret = __get_rfc5646_from_locale (name, win_locale);
+  if (ret < 0)
+    return ret;
+  if (ret)
     {
       bufp = (char *) malloc (1);	/* dummy */
       if (!bufp)
@@ -1094,12 +1109,12 @@ __collate_load_locale (struct __locale_t *locale, const char *name,
 	  free (bufp);
 	  return -1;
 	}
-      cop->lcid = lcid;
+      wcscpy (cop->win_locale, win_locale);
       cop->mbtowc = (mbtowc_p) f_mbtowc;
       stpcpy (cop->codeset, charset);
     }
   struct __lc_cats tmp = locale->lc_cat[LC_COLLATE];
-  locale->lc_cat[LC_COLLATE].ptr = lcid == 0 ? &_C_collate_locale : cop;
+  locale->lc_cat[LC_COLLATE].ptr = !win_locale[0] ? &_C_collate_locale : cop;
   locale->lc_cat[LC_COLLATE].buf = bufp;
   /* If buf is not NULL, both pointers have been alloc'ed */
   if (tmp.buf)
@@ -1119,11 +1134,11 @@ wcscoll_l (const wchar_t *__restrict ws1, const wchar_t *__restrict ws2,
 	   struct __locale_t *locale)
 {
   int ret;
-  LCID collate_lcid = __get_collate_locale (locale)->lcid;
+  const wchar_t *collate_locale = __get_collate_locale (locale)->win_locale;
 
-  if (!collate_lcid)
+  if (!collate_locale[0])
     return wcscmp (ws1, ws2);
-  ret = CompareStringW (collate_lcid, 0, ws1, -1, ws2, -1);
+  ret = CompareStringEx (collate_locale, 0, ws1, -1, ws2, -1, NULL, NULL, 0);
   if (!ret)
     set_errno (EINVAL);
   return ret - CSTR_EQUAL;
@@ -1143,12 +1158,10 @@ strcoll_l (const char *__restrict s1, const char *__restrict s2,
   wchar_t *ws1, *ws2;
   tmp_pathbuf tp;
   int ret;
-  LCID collate_lcid = __get_collate_locale (locale)->lcid;
+  const wchar_t *collate_locale = __get_collate_locale (locale)->win_locale;
 
-  if (!collate_lcid)
+  if (!collate_locale[0])
     return strcmp (s1, s2);
-  /* The ANSI version of CompareString uses the default charset of the lcid,
-     so we must use the Unicode version. */
   mbtowc_p collate_mbtowc = __get_collate_locale (locale)->mbtowc;
   n1 = lc_mbstowcs (collate_mbtowc, NULL, s1, 0) + 1;
   ws1 = (n1 > NT_MAX_PATH ? (wchar_t *) malloc (n1 * sizeof (wchar_t))
@@ -1158,7 +1171,7 @@ strcoll_l (const char *__restrict s1, const char *__restrict s2,
   ws2 = (n2 > NT_MAX_PATH ? (wchar_t *) malloc (n2 * sizeof (wchar_t))
 			  : tp.w_get ());
   lc_mbstowcs (collate_mbtowc, ws2, s2, n2);
-  ret = CompareStringW (collate_lcid, 0, ws1, -1, ws2, -1);
+  ret = CompareStringEx (collate_locale, 0, ws1, -1, ws2, -1, NULL, NULL, 0);
   if (n1 > NT_MAX_PATH)
     free (ws1);
   if (n2 > NT_MAX_PATH)
@@ -1350,15 +1363,15 @@ wcsxfrm_l (wchar_t *__restrict ws1, const wchar_t *__restrict ws2, size_t wsn,
 	   struct __locale_t *locale)
 {
   size_t ret;
-  LCID collate_lcid = __get_collate_locale (locale)->lcid;
+  const wchar_t *collate_locale = __get_collate_locale (locale)->win_locale;
 
-  if (!collate_lcid)
+  if (!collate_locale[0])
     return wcslcpy (ws1, ws2, wsn);
   /* Don't use LCMAP_SORTKEY in conjunction with LCMAP_BYTEREV.  The cchDest
      parameter is used as byte count with LCMAP_SORTKEY but as char count with
      LCMAP_BYTEREV. */
-  ret = LCMapStringW (collate_lcid, LCMAP_SORTKEY, ws2, -1, ws1,
-		      wsn * sizeof (wchar_t));
+  ret = LCMapStringEx (collate_locale, LCMAP_SORTKEY, ws2, -1, ws1,
+		       wsn * sizeof (wchar_t), NULL, NULL, 0);
   if (ret)
     {
       ret /= sizeof (wchar_t);
@@ -1380,7 +1393,8 @@ wcsxfrm_l (wchar_t *__restrict ws1, const wchar_t *__restrict ws2, size_t wsn,
     set_errno (EINVAL);
   else
     {
-      ret = LCMapStringW (collate_lcid, LCMAP_SORTKEY, ws2, -1, NULL, 0);
+      ret = LCMapStringEx (collate_locale, LCMAP_SORTKEY, ws2, -1,
+			   NULL, 0, NULL, NULL, 0);
       if (ret)
 	wsn = ret / sizeof (wchar_t);
     }
@@ -1401,12 +1415,10 @@ strxfrm_l (char *__restrict s1, const char *__restrict s2, size_t sn,
   size_t n2;
   wchar_t *ws2;
   tmp_pathbuf tp;
-  LCID collate_lcid = __get_collate_locale (locale)->lcid;
+  const wchar_t *collate_locale = __get_collate_locale (locale)->win_locale;
 
-  if (!collate_lcid)
+  if (!collate_locale[0])
     return strlcpy (s1, s2, sn);
-  /* The ANSI version of LCMapString uses the default charset of the lcid,
-     so we must use the Unicode version. */
   mbtowc_p collate_mbtowc = __get_collate_locale (locale)->mbtowc;
   n2 = lc_mbstowcs (collate_mbtowc, NULL, s2, 0) + 1;
   ws2 = (n2 > NT_MAX_PATH ? (wchar_t *) malloc (n2 * sizeof (wchar_t))
@@ -1415,8 +1427,8 @@ strxfrm_l (char *__restrict s1, const char *__restrict s2, size_t sn,
     {
       lc_mbstowcs (collate_mbtowc, ws2, s2, n2);
       /* The sort key is a NUL-terminated byte string. */
-      ret = LCMapStringW (collate_lcid, LCMAP_SORTKEY, ws2, -1,
-			  (PWCHAR) s1, sn);
+      ret = LCMapStringEx (collate_locale, LCMAP_SORTKEY, ws2, -1,
+			  (PWCHAR) s1, sn, NULL, NULL, 0);
     }
   if (ret == 0)
     {
@@ -1424,7 +1436,8 @@ strxfrm_l (char *__restrict s1, const char *__restrict s2, size_t sn,
       if (!ws2 || GetLastError () != ERROR_INSUFFICIENT_BUFFER)
 	set_errno (EINVAL);
       else
-	ret = LCMapStringW (collate_lcid, LCMAP_SORTKEY, ws2, -1, NULL, 0);
+	ret = LCMapStringEx (collate_locale, LCMAP_SORTKEY, ws2, -1,
+			     NULL, 0, NULL, NULL, 0);
     }
   if (ws2 && n2 > NT_MAX_PATH)
     free (ws2);
@@ -1442,20 +1455,30 @@ strxfrm (char *__restrict s1, const char *__restrict s2, size_t sn)
 /* Fetch default ANSI codepage from locale info and generate a setlocale
    compatible character set code.  Called from newlib's setlocale(), if the
    charset isn't given explicitely in the POSIX compatible locale specifier. */
+
+/* FIXME: Check all locales against their Linux counterpart again and
+	  make sure the codeset conversion is correct.
+   FIXME: Perhaps, convert to locale names only.
+   FIXME: Perhaps, maintain a sorted list of Linux locales and their
+	  default codesets. */
 extern "C" void
 __set_charset_from_locale (const char *locale, char *charset)
 {
   UINT cp;
-  LCID lcid = __get_lcid_from_locale (locale);
+  wchar_t win_locale[ENCODING_LEN + 1];
+
+  int ret = __get_rfc5646_from_locale (locale, win_locale);
   wchar_t wbuf[9];
 
   /* "C" locale, or invalid locale? */
-  if (lcid == 0 || lcid == (LCID) -1)
+  if (ret <= 0)
     cp = 20127;
-  else if (!GetLocaleInfoW (lcid,
+  else if (!GetLocaleInfoEx (win_locale,
 			    LOCALE_IDEFAULTANSICODEPAGE | LOCALE_RETURN_NUMBER,
 			    (PWCHAR) &cp, sizeof cp))
     cp = 0;
+  /* For simplicity, we still convert to LCID here. */
+  LCID lcid = LocaleNameToLCID (win_locale, 0);
   /* Translate codepage and lcid to a charset closely aligned with the default
      charsets defined in Glibc. */
   const char *cs;
@@ -1481,11 +1504,12 @@ __set_charset_from_locale (const char *locale, char *charset)
       cs = "BIG5";
       break;
     case 1250:
-      if (lcid == 0x181a		/* sr_BA (Serbian Language/Bosnia
+      if (lcid == 0x181a		/* sr_BA (Serbian/Bosnia
 						  and Herzegovina) */
-	  || lcid == 0x241a		/* sr_RS (Serbian Language/Serbia) */
-	  || lcid == 0x2c1a		/* sr_ME (Serbian Language/Montenegro)*/
-	  || lcid == 0x0442)		/* tk_TM (Turkmen/Turkmenistan) */
+	  || lcid == 0x241a		/* sr_RS (Serbian/Serbia) */
+	  || lcid == 0x2c1a		/* sr_ME (Serbian/Montenegro)*/
+	  || lcid == 0x0442		/* tk_TM (Turkmen/Turkmenistan) */
+	  || !wcscmp (win_locale, L"sr-Latn-XK")) /* (Serbian/Kosovo) */
 	cs = "UTF-8";
       else if (lcid == 0x041c)		/* sq_AL (Albanian/Albania) */
 	cs = "ISO-8859-1";
@@ -1498,17 +1522,21 @@ __set_charset_from_locale (const char *locale, char *charset)
 	  || lcid == 0x281a		/* sr_RS (Serbian Language/Serbia) */
 	  || lcid == 0x301a		/* sr_ME (Serbian Language/Montenegro)*/
 	  || lcid == 0x0440		/* ky_KG (Kyrgyz/Kyrgyzstan) */
+	  || lcid == 0x082c		/* az_AZ@cyrillic (Azerbaijani/Azerbaijan) */
 	  || lcid == 0x0843		/* uz_UZ (Uzbek/Uzbekistan) */
 					/* tt_RU (Tatar/Russia),
 						 IQTElif alphabet */
 	  || (lcid == 0x0444 && has_modifier ("@iqtelif"))
-	  || lcid == 0x0450)		/* mn_MN (Mongolian/Mongolia) */
+	  || lcid == 0x0450		/* mn_MN (Mongolian/Mongolia) */
+	  || !wcscmp (win_locale, L"sr-Cyrl-XK")) /* (Serbian/Kosovo) */
 	cs = "UTF-8";
       else if (lcid == 0x0423)		/* be_BY (Belarusian/Belarus) */
 	cs = has_modifier ("@latin") ? "UTF-8" : "CP1251";
-      else if (lcid == 0x0402)		/* bg_BG (Bulgarian/Bulgaria) */
+      else if (lcid == 0x0402		/* bg_BG (Bulgarian/Bulgaria) */
+	       || lcid == 0x0423)	/* be_BY (Belarusian/Belarus) */
 	cs = "CP1251";
-      else if (lcid == 0x0422)		/* uk_UA (Ukrainian/Ukraine) */
+      else if (lcid == 0x0422		/* uk_UA (Ukrainian/Ukraine) */
+	      || !wcscmp (win_locale, L"ru-UA")) /* (Russian/Ukraine) */
 	cs = "KOI8-U";
       else if (lcid == 0x0428)		/* tg_TJ (Tajik/Tajikistan) */
 	cs = "KOI8-T";
@@ -1532,7 +1560,8 @@ __set_charset_from_locale (const char *locale, char *charset)
 	       || lcid == 0x0832	/* tn_BW (Tswana/Botswana) */
 	       || lcid == 0x0432	/* tn_ZA (Tswana/South Africa) */
 	       || lcid == 0x0488	/* wo_SN (Wolof/Senegal) */
-	       || lcid == 0x046a)	/* yo_NG (Yoruba/Nigeria) */
+	       || lcid == 0x046a	/* yo_NG (Yoruba/Nigeria) */
+	       || lcid == 0x085f)	/* ber_DZ (Tamazight/Algeria) */
 	cs = "UTF-8";
       else if (lcid == 0x042e)		/* hsb_DE (Upper Sorbian/Germany) */
 	cs = "ISO-8859-2";
diff --git a/winsup/cygwin/regex/regcomp.c b/winsup/cygwin/regex/regcomp.c
index 5acfd4e52e61..9e29257cb3a8 100644
--- a/winsup/cygwin/regex/regcomp.c
+++ b/winsup/cygwin/regex/regcomp.c
@@ -827,7 +827,7 @@ p_b_term(struct parse *p, cset *cs)
 			CHadd(p, cs, start);
 		else {
 #ifdef __CYGWIN__
-			if (!__get_current_collate_locale ()->lcid) {
+			if (!__get_current_collate_locale ()->win_locale[0]) {
 #else
 			if (__collate_load_error) {
 #endif

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2023-02-24 15:42 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-02-24 15:42 [newlib-cygwin/main] Cygwin: convert Windows locale handling from LCID to ISO5646 strings Corinna Vinschen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).