From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <corinna@sourceware.org>
Received: by sourceware.org (Postfix, from userid 2155)
	id 7BAF73858035; Wed, 15 Feb 2023 21:37:45 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 7BAF73858035
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org;
	s=default; t=1676497065;
	bh=4SYhaAq/RhWF41Cf96tR8C3A8jPXiWOnaKlLTW1Qgz8=;
	h=From:To:Subject:Date:From;
	b=hiSjmP3ga235QVuEx3x+LePymlVUkxINEh4lRm/R8YZFb4p6L75iQeLjyTWlf8rYe
	 xOi+ZZKWhbw23zW2WdAbClbeB4U6Gef4+/GjBDJusAg62vXYmz9mOPZTfgacaBYqpg
	 uxSLxEMWiuHRjJ5Y1bXA7WfZTaGpT9EKuNXs6kUo=
Content-Type: text/plain; charset="utf-8"
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
From: Corinna Vinschen <corinna@sourceware.org>
To: cygwin-cvs@sourceware.org
Subject: [newlib-cygwin/main] Cygwin: is_unicode_equiv: implement Unicode equivalence class check
X-Act-Checkin: newlib-cygwin
X-Git-Author: Corinna Vinschen <corinna@vinschen.de>
X-Git-Refname: refs/heads/main
X-Git-Oldrev: be67844f5da8cb6fb4c39cfdb3460e6955660b8e
X-Git-Newrev: b5f9b0241a36b8e0197405fc5ab23cbf0ba43e03
Message-Id: <20230215213745.7BAF73858035@sourceware.org>
Date: Wed, 15 Feb 2023 21:37:45 +0000 (GMT)
List-Id: <cygwin-cvs.sourceware.org>

https://sourceware.org/git/gitweb.cgi?p=3Dnewlib-cygwin.git;h=3Db5f9b0241a3=
6b8e0197405fc5ab23cbf0ba43e03

commit b5f9b0241a36b8e0197405fc5ab23cbf0ba43e03
Author:     Corinna Vinschen <corinna@vinschen.de>
AuthorDate: Wed Feb 15 22:00:39 2023 +0100
Commit:     Corinna Vinschen <corinna@vinschen.de>
CommitDate: Wed Feb 15 22:00:39 2023 +0100

    Cygwin: is_unicode_equiv: implement Unicode equivalence class check
   =20
    is_unicode_equiv compares two UTF-32 values and returns 1 if
    both are members of the same Unicode equivalence class, 0 otherwise.
   =20
    Note that this function only works with precomposed characters
    per Unicode normalization form C.  It doesn't handle decomposed
    characters, just like its counterpart in glibc.  I.e., equivalence
    class comparison using decomposed chars won't work.  Example:
   =20
      fnmatch("[=3Dn=3D]", "=C3=B1") =3D=3D 0
      fnmatch("[=3D=C3=B1=3D]", "n") =3D=3D 0
   =20
    but
   =20
      fnmatch("[=3Dn=3D]", "n\x0303") =3D=3D 1
      fnmatch("[=3Dn\x0303=3D]", "n") =3D=3D 1
      fnmatch("[=3Dn\x0303=3D]", "n\x0303") =3D=3D 1
   =20
    Signed-off-by: Corinna Vinschen <corinna@vinschen.de>

Diff:
---
 winsup/cygwin/local_includes/collate.h |  2 ++
 winsup/cygwin/nlsfuncs.cc              | 48 ++++++++++++++++++++++++++++++=
++++
 2 files changed, 50 insertions(+)

diff --git a/winsup/cygwin/local_includes/collate.h b/winsup/cygwin/local_i=
ncludes/collate.h
index a89829a8336d..c3454575dd92 100644
--- a/winsup/cygwin/local_includes/collate.h
+++ b/winsup/cygwin/local_includes/collate.h
@@ -15,6 +15,8 @@ extern const int __collate_load_error;
=20
 extern int __collate_range_cmp (int c1, int c2);
=20
+int is_unicode_equiv (wint_t, wint_t);
+
 #ifdef __cplusplus
 };
 #endif
diff --git a/winsup/cygwin/nlsfuncs.cc b/winsup/cygwin/nlsfuncs.cc
index 0d204929d24c..f3701312b92b 100644
--- a/winsup/cygwin/nlsfuncs.cc
+++ b/winsup/cygwin/nlsfuncs.cc
@@ -11,6 +11,7 @@ details. */
 #include <stdlib.h>
 #include <locale.h>
 #include <wchar.h>
+#include <wctype.h>
 #include "path.h"
 #include "fhandler.h"
 #include "dtable.h"
@@ -1110,6 +1111,7 @@ __collate_load_locale (struct __locale_t *locale, con=
st char *name,
 /* We use the Windows functions for locale-specific string comparison and
    transformation.  The advantage is that we don't need any files with
    collation information. */
+
 extern "C" int
 wcscoll_l (const wchar_t *__restrict ws1, const wchar_t *__restrict ws2,
 	   struct __locale_t *locale)
@@ -1193,6 +1195,52 @@ __collate_range_cmp (int c1, int c2)
   return wcscoll (s1, s2);
 }
=20
+/* Check if UTF-32 character `test' is in the same equivalence class as
+   the UTF-32 character `eqv'.  Returns 1 if so, 0 otherwise.
+   Note that we only recognize input in Unicode normalization form C,
+   that is, we expect all letters to be composed.  A single character
+   is all we look at.
+   To check equivalence, lowercase and decompose pattern letter and input
+   letter and check the base character for equality.  Also, convert all
+   digits to the ASCII digits 0 - 9 and compare. */
+extern "C" int
+is_unicode_equiv (wint_t test, wint_t eqv)
+{
+	wchar_t decomp_testc[5] =3D { 0 };
+	wchar_t decomp_eqvc[5] =3D { 0 };
+	wchar_t testc[3] =3D { 0 };
+	wchar_t eqvc[3] =3D { 0 };
+
+	/* For equivalence classes, case doesn't matter.  However, be careful.
+	   Only convert chars which have an "upper" to "lower". */
+	if (iswupper (eqv))
+		eqv =3D towlower (eqv);
+	if (iswupper (test))
+		test =3D towlower (test);
+	/* Convert to UTF-16 string.  U+10000 itself already needs a pair. */
+	if (eqv >=3D 0x10000) {
+		eqvc[0] =3D ((eqv - 0x10000) >> 10) + 0xd800;
+		eqvc[1] =3D ((eqv - 0x10000) & 0x3ff) + 0xdc00;
+	} else
+		eqvc[0] =3D eqv;
+	if (test >=3D 0x10000) {
+		testc[0] =3D ((test - 0x10000) >> 10) + 0xd800;
+		testc[1] =3D ((test - 0x10000) & 0x3ff) + 0xdc00;
+	} else
+		testc[0] =3D test;
+	/* Convert to decomposed form, folding digits to ASCII 0 - 9 */
+	FoldStringW (MAP_COMPOSITE | MAP_FOLDDIGITS, eqvc, -1, decomp_eqvc, 5);
+	FoldStringW (MAP_COMPOSITE | MAP_FOLDDIGITS, testc, -1, decomp_testc, 5);
+	/* If they are equivalent, the base char must be the same. */
+	if (decomp_eqvc[0] !=3D decomp_testc[0])
+		return 0;
+	/* If the base char is a high surrogate, compare the low one, too */
+	if (decomp_eqvc[0] >=3D 0xd800 && decomp_eqvc[0] <=3D 0xdbff &&
+	    decomp_eqvc[1] !=3D decomp_testc[1])
+		return 0;
+	return 1;
+}
+
 extern "C" size_t
 wcsxfrm_l (wchar_t *__restrict ws1, const wchar_t *__restrict ws2, size_t =
wsn,
 	   struct __locale_t *locale)