[newlib-cygwin] character data generation

public inbox for newlib-cvs@sourceware.org
help / color / mirror / Atom feed

* [newlib-cygwin] character data generation
@ 2018-03-12 10:40 Corinna Vinschen
  0 siblings, 0 replies; only message in thread
From: Corinna Vinschen @ 2018-03-12 10:40 UTC (permalink / raw)
  To: newlib-cvs

https://sourceware.org/git/gitweb.cgi?p=newlib-cygwin.git;h=a3527300049872d68dcaf0f9faf6bb2a86d50f45

commit a3527300049872d68dcaf0f9faf6bb2a86d50f45
Author: Thomas Wolff <towo@towo.net>
Date:   Thu Mar 8 00:02:35 2018 +0100

    character data generation

Diff:
---
 newlib/libc/ctype/mkcaseconv   | 128 +++++++++++++++++++++++++++++++++++++++++
 newlib/libc/ctype/mkcategories |  69 ++++++++++++++++++++++
 newlib/libc/ctype/mkunidata    |  40 +++++++++++++
 3 files changed, 237 insertions(+)

diff --git a/newlib/libc/ctype/mkcaseconv b/newlib/libc/ctype/mkcaseconv
new file mode 100755
index 0000000..ab0571d
--- /dev/null
+++ b/newlib/libc/ctype/mkcaseconv
@@ -0,0 +1,128 @@
+#! /bin/sh
+set -f	# no globbing; set here because "sh ./mkcaseconv" ignores shebang options
+# generate a table for Unicode case conversion; entries:
+# struct caseconv_entry defined in towctrans_l.c
+
+if [ -r UnicodeData.txt ]	# prefer a copy in the current directory
+then	UnicodeData=UnicodeData.txt
+elif [ -r /usr/share/unicode/ucd/UnicodeData.txt ]	# else the system-wide copy
+then	UnicodeData=/usr/share/unicode/ucd/UnicodeData.txt
+else	echo UnicodeData.txt not found >&2
+	exit 1
+fi
+
+LC_ALL=C	# run every tool below in the C locale
+export LC_ALL
+
+compact=true	# true: merged range entries; otherwise one entry per character
+
+#0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
+#0061;LATIN SMALL LETTER A;Ll;0;L;;;;;N;;;0041;;0041
+#0130;LATIN CAPITAL LETTER I WITH DOT ABOVE;Lu;0;L;0049 0307;;;;N;LATIN CAPITAL LETTER I DOT;;;0069;
+#01C4;LATIN CAPITAL LETTER DZ WITH CARON;Lu;0;L;<compat> 0044 017D;;;;N;LATIN CAPITAL LETTER D Z HACEK;;;01C6;01C5
+#01C5;LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON;Lt;0;L;<compat> 0044 017E;;;;N;LATIN LETTER CAPITAL D SMALL Z HACEK;;01C4;01C6;01C5
+#01C6;LATIN SMALL LETTER DZ WITH CARON;Ll;0;L;<compat> 0064 017E;;;;N;LATIN SMALL LETTER D Z HACEK;;01C4;;01C5
+# records like the samples above become: src CODE upper "U" lower "L" title "T" (U, L, T = the last three fields)
+tr -d '\015' < $UnicodeData |	# strip carriage returns
+sed \
+-e 's,^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;][^;]*\);\([^;]*\);\([^;]*\)$,src \1 upper "\2" lower "\3" title "\4",' \
+-e t \
+-e 's,^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;][^;]*\);\([^;]*\)$,src \1 upper "\2" lower "\3" title "\4",' \
+-e t \
+-e 's,^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;]*\);\([^;][^;]*\)$,src \1 upper "\2" lower "\3" title "\4",' \
+-e t \
+-e d |	# each "t" ends the cycle after a successful rewrite, so only records with all three fields empty are deleted
+(#src 01C5 upper "01C4" lower "01C6" title "01C5"
+if $compact	# compact output: classify each record, then merge runs into ranges
+then
+  (
+  cat <<\/EOS	# literal here-document: the definition of src() goes to sh first
+  src () {	# $1 = code point, $3 = upper mapping, $5 = lower mapping (hex digits, possibly empty)
+    if [ -n "$3" ]
+    then	tohi=$(( 0x0$3 - 0x0$1 ))
+    else	tohi=0
+    fi
+    if [ -n "$5" ]
+    then	tolo=$(( 0x0$5 - 0x0$1 ))
+    else	tolo=0
+    fi
+    case "$tolo.$tohi" in	# tolo/tohi: signed distance to the lower/upper mapping, 0 if none
+    0.0)	true;;	# no offset in either direction: print nothing
+    0.*)	# only an upper-case offset
+	case "$1.$tohi" in	# keyed on the last hex digit (even/odd) and the offset
+	*[02468ACE].1)	echo "'#error' U+$1 ODDSML";;	# capital directly after its small letter: reported as an error line
+	*[02468ACE].-1)	echo "  0x$1 TO1 ODDCAP";;	# capital is the preceding, odd code point
+	*[13579BDF].1)	echo "'#error' U+$1 EVENSML";;
+	*[13579BDF].-1)	echo "  0x$1 TO1 EVENCAP";;	# capital is the preceding, even code point
+	*)		echo "  0x$1 TOUP $tohi";;	# any other distance is printed as a TOUP offset
+	esac;;
+    *.0)	# only a lower-case offset
+	case "$1.$tolo" in
+	*[02468ACE].1)	echo "  0x$1 TO1 EVENCAP";;	# this even code point is the capital, its small letter follows
+	*[02468ACE].-1)	echo "'#error' U+$1 EVENSML";;
+	*[13579BDF].1)	echo "  0x$1 TO1 ODDCAP";;	# this odd code point is the capital, its small letter follows
+	*[13579BDF].-1)	echo "'#error' U+$1 ODDSML";;
+	*)		echo "  0x$1 TOLO $tolo";;	# any other distance is printed as a TOLO offset
+	esac;;
+    *)	case "$tolo.$tohi" in	# both offsets are non-zero
+	1.-1)		echo "  0x$1 TOBOTH 0";;	# lower is the next code point, upper the previous one
+	*)		echo "'#error' U+$1";;
+	esac;;
+    esac
+  }
+/EOS
+  cat	# then the "src ..." records arriving on stdin are appended unchanged
+  ) | sh |	# sh reads the definition followed by one src call per record
+  uniq -f1 --group=append | sed -e "s,^$,range," -e t -e "s,^,item ," |	# "item LINE" per entry, "range" after each run of equal mappings
+  (
+  cat <<\/EOS
+  first=	# first code point of the pending range; empty when none is pending
+  diff=-1	# last - first of the pending range; -1 when none is pending
+  max=255	# largest diff written into a single range entry
+  range () {	# print the pending range, if any, and reset the state
+	# $diff == $(($last - $first))
+	if [ "$diff" -ge 0 ]
+	then	# we have items at all
+		echo "  {$first, $diff, $v2, $v3},"
+	fi
+	first=
+	diff=-1
+  }
+  item () {	# $1 = code point (0x...), $2 and $3 = the two mapping fields of the entry
+	if [ "$1" = "#error" ]	# POSIX test spells equality "="; "==" fails in shells such as dash
+	then	echo "$*"
+		return
+	fi
+	# flush when the pending range is full or this code point is not adjacent to the last
+	if [ "$diff" -eq "$max" ]
+	then	range
+	elif [ -n "$first" ]
+	then	if [ $(( $1 )) -ne $(( ${last-0} + 1 )) ]
+		then	range
+		fi
+	fi
+	# the first item of a new range supplies the values that get printed
+	if [ -z "$first" ]
+	then	first=$1
+		v2=$2
+		v3=$3
+	fi
+
+	last=$1
+	diff=$(( $diff + 1 ))
+  }
+/EOS
+  cat
+  ) | sh
+elif false	# disabled variant: same entries as the else branch, with the differences evaluated by sh via printf %d
+then
+  sed -e 's/src \([^ ]*\) upper "\([^ ]*\)" lower "\([^ ]*\)" title "\([^ ]*\)"/  {0x\1, 0x\2 - 0x\1, 0x\3 - 0x\1},/' \
+      -e 's/0x - 0x[^ ,}]*/0/g' -e 's/0x}/0}/' \
+      -e 's/\(0x[0-9A-F][0-9A-F]*\) - \(0x[0-9A-F][0-9A-F]*\)/$((`printf %d \1` - `printf %d \2`))/g' \
+      -e 's/^/echo "/' -e 's/$/"/' |
+  sh
+else	# non-compact output: one {0xCODE, 0xUPPER - 0xCODE, 0xLOWER - 0xCODE} entry per record; an empty mapping collapses to 0
+  sed -e 's/src \([^ ]*\) upper "\([^ ]*\)" lower "\([^ ]*\)" title "\([^ ]*\)"/  {0x\1, 0x\2 - 0x\1, 0x\3 - 0x\1},/' \
+      -e 's/0x - 0x[^ ,}]*/0/g' -e 's/0x}/0}/'
+fi
+) > caseconv.t	# output of whichever branch ran is the generated table
diff --git a/newlib/libc/ctype/mkcategories b/newlib/libc/ctype/mkcategories
new file mode 100755
index 0000000..24dd93a
--- /dev/null
+++ b/newlib/libc/ctype/mkcategories
@@ -0,0 +1,69 @@
+#! /bin/sh
+
+# generate table of Unicode character category ranges;
+# note: undefined characters between two characters of the same category
+# are associated to the same category, e.g.
+#0A0A;GURMUKHI LETTER UU;Lo
+#0A0B..0A0E           -> Lo
+#0A0F;GURMUKHI LETTER EE;Lo
+
+if [ -r UnicodeData.txt ]
+then	UnicodeData=UnicodeData.txt
+elif [ -r /usr/share/unicode/ucd/UnicodeData.txt ]
+then	UnicodeData=/usr/share/unicode/ucd/UnicodeData.txt
+else	echo UnicodeData.txt not found >&2
+	exit 1
+fi
+
+# the code assumes foldall=false, foldcase=true
+foldall=false
+foldcase=true
+
+(	# emits one shell script: the function definitions, then one call per input record
+cat <<\/EOS
+first=
+item () {	# $1 = code point (hex, no 0x prefix), $2 = category
+	if [ -n "$first" ]
+	then	if [ $(( 0x$1 )) -ne $(( 0x${last-0} + 1 )) ]
+		then	range
+		fi
+	fi
+	# NOTE(review): the check above closes the range at every gap in the code points, which looks at odds with the header note about undefined characters being merged - TODO confirm the intent
+	if [ -z "$first" ]
+	then	first=$1
+		val=$2
+	fi
+
+	last=$1
+}
+range () {	# print the pending range as {CAT_x, first, last - first} and clear it
+#	echo "    {0x$first, 0x$last, CAT_$val},"
+#	echo "    {0x$first, $((0x$last - 0x$first)), CAT_$val},"
+#	echo "    {0x$first | (CAT_$val << 24), $((0x$last - 0x$first))},"
+	echo "    {CAT_$val, 0x$first, $((0x$last - 0x$first))},"
+	first=
+}
+/EOS
+	# everything below turns UnicodeData records into "item CODE CATEGORY" / "range" calls
+cat "$UnicodeData" |
+if $foldall	# coarse folding of many categories; off by default
+then sed -e "s,;L[lu];,;LC;," -e "s,;C[fs];,;Cfs;," \
+	 -e "s,;L[mo];,;Lmo;," -e "s,;Nl;,;Lmo;," \
+	 -e "s,;P.;,;P;,"  -e "s,;No;,;P;," \
+	 -e "s,;S.;,;S;," -e "s,;Z[lp];,;Zlp;," \
+	 -e "s,;C[no];,;X;," -e "s,;M[cen];,;M;,"
+elif $foldcase
+then
+# fold Lu/Ll to LC only if lower/upper conversion is available
+ sed -e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;][^;]*\);.*/ s/;Lu;/;LC;/' \
+     -e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;][^;]*\);\([^;]*\);.*/ s/;Ll;/;LC;/' \
+     -e '/;Co;/ d'	# records of category Co are dropped entirely
+else cat
+fi |
+sed -e "s,^\([^;]*\);[^;]*;\([^;]*\);.*,\1	\2," |	# keep only code point and category (first and third field)
+uniq -f1 --group=append | sed -e "s,^$,range," -e t -e "s,^,item ,"	# "item ..." per record, "range" after each run of one category
+) | sh > categories.t	# running the generated script prints the table
+
+sed -e "s/.*\(CAT_[A-Za-z]*\).*/  \1,/" categories.t |	# reduce every table entry to "  CAT_xx,"
+sort -u > categories.cat	# sorted, each category name once
+
diff --git a/newlib/libc/ctype/mkunidata b/newlib/libc/ctype/mkunidata
new file mode 100755
index 0000000..ea18e67
--- /dev/null
+++ b/newlib/libc/ctype/mkunidata
@@ -0,0 +1,40 @@
+#! /bin/sh
+# regenerate the Unicode data tables; all work happens in the directory of this script
+echo generating Unicode character properties data for newlib/libc/ctype
+
+cd "$(dirname "$0")" || exit 9	# quoted so paths with spaces work; stop if the directory is unreachable
+
+#############################################################################
+# checks and (with option -u) download
+
+case "$1" in
+-u)
+	#WGET="wget -N -t 1 --timeout=55"
+	WGET="curl -R -O --connect-timeout 55"	# -R: keep the remote timestamp, -O: save under the remote name
+	echo downloading data from unicode.org
+	for data in UnicodeData.txt
+	do	# -z FILE: transfer only when the remote copy is newer than the local FILE
+		if test -e "$data"; then newer="-z $data"; else newer=; fi
+		$WGET $newer "http://unicode.org/Public/UNIDATA/$data"
+	done
+	;;
+*)	echo checking package unicode-ucd
+	grep unicode-ucd /etc/setup/installed.db || exit 9
+	;;
+esac
+
+for data in UnicodeData.txt
+do	test -r $data || ln -s /usr/share/unicode/ucd/$data . || exit 9
+done
+
+#############################################################################
+# table generation
+
+echo generating character category table for "isw*.c"
+	sh ./mkcategories
+
+echo generating case conversion table for "tow*.c"
+	sh ./mkcaseconv
+
+#############################################################################
+# end


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2018-03-12 10:40 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-12 10:40 [newlib-cygwin] character data generation Corinna Vinschen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).