public inbox for newlib-cvs@sourceware.org
help / color / mirror / Atom feed
* [newlib-cygwin] character data generation
@ 2018-03-12 10:40 Corinna Vinschen
0 siblings, 0 replies; only message in thread
From: Corinna Vinschen @ 2018-03-12 10:40 UTC (permalink / raw)
To: newlib-cvs
https://sourceware.org/git/gitweb.cgi?p=newlib-cygwin.git;h=a3527300049872d68dcaf0f9faf6bb2a86d50f45
commit a3527300049872d68dcaf0f9faf6bb2a86d50f45
Author: Thomas Wolff <towo@towo.net>
Date: Thu Mar 8 00:02:35 2018 +0100
character data generation
Diff:
---
newlib/libc/ctype/mkcaseconv | 128 +++++++++++++++++++++++++++++++++++++++++
newlib/libc/ctype/mkcategories | 69 ++++++++++++++++++++++
newlib/libc/ctype/mkunidata | 40 +++++++++++++
3 files changed, 237 insertions(+)
diff --git a/newlib/libc/ctype/mkcaseconv b/newlib/libc/ctype/mkcaseconv
new file mode 100755
index 0000000..ab0571d
--- /dev/null
+++ b/newlib/libc/ctype/mkcaseconv
@@ -0,0 +1,128 @@
+#! /bin/sh -f
+
+# generate a table for Unicode case conversion; entries:
+# struct caseconv_entry defined in towctrans_l.c
+
+if [ -r UnicodeData.txt ] # prefer a readable copy in the current directory
+then UnicodeData=UnicodeData.txt
+elif [ -r /usr/share/unicode/ucd/UnicodeData.txt ] # otherwise use the system-wide copy
+then UnicodeData=/usr/share/unicode/ucd/UnicodeData.txt
+else echo UnicodeData.txt not found >&2
+ exit 1
+fi
+
+LC_ALL=C # exported so tr, sed, uniq and the child sh processes all run in the C locale
+export LC_ALL
+
+compact=true # selects the first branch of the if/elif/else that formats the table (merged ranges)
+
+#0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
+#0061;LATIN SMALL LETTER A;Ll;0;L;;;;;N;;;0041;;0041
+#0130;LATIN CAPITAL LETTER I WITH DOT ABOVE;Lu;0;L;0049 0307;;;;N;LATIN CAPITAL LETTER I DOT;;;0069;
+#01C4;LATIN CAPITAL LETTER DZ WITH CARON;Lu;0;L;<compat> 0044 017D;;;;N;LATIN CAPITAL LETTER D Z HACEK;;;01C6;01C5
+#01C5;LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON;Lt;0;L;<compat> 0044 017E;;;;N;LATIN LETTER CAPITAL D SMALL Z HACEK;;01C4;01C6;01C5
+#01C6;LATIN SMALL LETTER DZ WITH CARON;Ll;0;L;<compat> 0064 017E;;;;N;LATIN SMALL LETTER D Z HACEK;;01C4;;01C5
+
+tr -d '\015' < $UnicodeData | # strip CRs; the sed below rewrites a line to: src CODE upper "F13" lower "F14" title "F15" when field 13, 14 or 15 is non-empty
+sed \
+-e 's,^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;][^;]*\);\([^;]*\);\([^;]*\)$,src \1 upper "\2" lower "\3" title "\4",' \
+-e t \
+-e 's,^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;][^;]*\);\([^;]*\)$,src \1 upper "\2" lower "\3" title "\4",' \
+-e t \
+-e 's,^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;]*\);\([^;][^;]*\)$,src \1 upper "\2" lower "\3" title "\4",' \
+-e t \
+-e d | # each "t" skips the rest once a substitution succeeded; lines matching none are deleted here
+(#src 01C5 upper "01C4" lower "01C6" title "01C5"
+if $compact
+then
+ (
+ cat <<\/EOS
+ src () { # called as: src CODE upper "U" lower "L" title "T"; reads $1 (code), $3 (upper), $5 (lower); the title value is not used
+ if [ -n "$3" ]
+ then tohi=$(( 0x0$3 - 0x0$1 )) # signed offset from the code point to its upper mapping (hex operands)
+ else tohi=0
+ fi
+ if [ -n "$5" ]
+ then tolo=$(( 0x0$5 - 0x0$1 )) # signed offset from the code point to its lower mapping
+ else tolo=0
+ fi
+ case "$tolo.$tohi" in
+ 0.0) true;; # both offsets are zero: print nothing for this code point
+ 0.*) # only the upper offset is non-zero
+ case "$1.$tohi" in # dispatch on the last hex digit of the code (even/odd) and an offset of exactly 1 or -1
+ *[02468ACE].1) echo "'#error' U+$1 ODDSML";;
+ *[02468ACE].-1) echo " 0x$1 TO1 ODDCAP";;
+ *[13579BDF].1) echo "'#error' U+$1 EVENSML";;
+ *[13579BDF].-1) echo " 0x$1 TO1 EVENCAP";;
+ *) echo " 0x$1 TOUP $tohi";; # any other offset is printed as a decimal delta
+ esac;;
+ *.0) # only the lower offset is non-zero
+ case "$1.$tolo" in
+ *[02468ACE].1) echo " 0x$1 TO1 EVENCAP";;
+ *[02468ACE].-1) echo "'#error' U+$1 EVENSML";;
+ *[13579BDF].1) echo " 0x$1 TO1 ODDCAP";;
+ *[13579BDF].-1) echo "'#error' U+$1 ODDSML";;
+ *) echo " 0x$1 TOLO $tolo";; # any other offset is printed as a decimal delta
+ esac;;
+ *) case "$tolo.$tohi" in # both offsets are non-zero: only lower +1 / upper -1 is accepted, anything else becomes an error line
+ 1.-1) echo " 0x$1 TOBOTH 0";;
+ *) echo "'#error' U+$1";;
+ esac;;
+ esac
+ }
+/EOS
+ cat
+ ) | sh |
+ uniq -f1 --group=append | sed -e "s,^$,range," -e t -e "s,^,item ," | # group adjacent lines that agree after field 1; each blank group terminator becomes "range", every other line gets an "item " prefix
+ (
+ cat <<\/EOS
+ first= # first code of the open range; empty while no range is open
+ diff=-1 # count of items in the open range minus one; -1 while no range is open
+ max=255 # item() closes the open range once diff has reached this value
+ range () { # print the open range (if any) as {first, diff, v2, v3} and reset the state
+ # $diff == $(($last - $first))
+ if [ "$diff" -ge 0 ]
+ then # we have items at all
+ echo " {$first, $diff, $v2, $v3},"
+ fi
+ first=
+ diff=-1
+ }
+ item () {
+ if [ "$1" == "#error" ]
+ then echo "$*"
+ return
+ fi
+
+ if [ $diff -eq $max ]
+ then range
+ elif [ -n "$first" ]
+ then if [ $(( $1 )) -ne $(( ${last-0} + 1 )) ]
+ then range
+ fi
+ fi
+
+ if [ -z "$first" ]
+ then first=$1
+ v2=$2
+ v3=$3
+ fi
+
+ last=$1
+ diff=$(( $diff + 1 ))
+ }
+/EOS
+ cat
+ ) | sh
+elif false # never taken: one entry per code point, with the offsets evaluated to numbers by a generated echo script run through sh
+then
+ sed -e 's/src \([^ ]*\) upper "\([^ ]*\)" lower "\([^ ]*\)" title "\([^ ]*\)"/ {0x\1, 0x\2 - 0x\1, 0x\3 - 0x\1},/' \
+ -e 's/0x - 0x[^ ,}]*/0/g' -e 's/0x}/0}/' \
+ -e 's/\(0x[0-9A-F][0-9A-F]*\) - \(0x[0-9A-F][0-9A-F]*\)/$((`printf %d \1` - `printf %d \2`))/g' \
+ -e 's/^/echo "/' -e 's/$/"/' |
+ sh
+else # compact is not true: one entry per code point, offsets left as "0xMAP - 0xCODE" expressions, empty mappings replaced by 0
+ sed -e 's/src \([^ ]*\) upper "\([^ ]*\)" lower "\([^ ]*\)" title "\([^ ]*\)"/ {0x\1, 0x\2 - 0x\1, 0x\3 - 0x\1},/' \
+ -e 's/0x - 0x[^ ,}]*/0/g' -e 's/0x}/0}/'
+fi
+) > caseconv.t
diff --git a/newlib/libc/ctype/mkcategories b/newlib/libc/ctype/mkcategories
new file mode 100755
index 0000000..24dd93a
--- /dev/null
+++ b/newlib/libc/ctype/mkcategories
@@ -0,0 +1,69 @@
+#! /bin/sh
+
+# generate table of Unicode character category ranges;
+# note: undefined characters between two characters of the same category
+# are associated to the same category, e.g.
+#0A0A;GURMUKHI LETTER UU;Lo
+#0A0B..0A0E -> Lo
+#0A0F;GURMUKHI LETTER EE;Lo
+
+if [ -r UnicodeData.txt ] # prefer a readable copy in the current directory
+then UnicodeData=UnicodeData.txt
+elif [ -r /usr/share/unicode/ucd/UnicodeData.txt ] # otherwise use the system-wide copy
+then UnicodeData=/usr/share/unicode/ucd/UnicodeData.txt
+else echo UnicodeData.txt not found >&2
+ exit 1
+fi
+
+# the code assumes foldall=false, foldcase=true
+foldall=false # if true, the first sed branch of the pipeline merges related categories (e.g. Ll/Lu to LC, P. to P)
+foldcase=true # if true (and foldall is false), Lu/Ll become LC on lines with a case mapping and Co lines are dropped
+
+(
+cat <<\/EOS
+first= # first code of the open range; empty while no range is open
+item () { # args: $1 code point (hex, no 0x prefix), $2 category name
+ if [ -n "$first" ]
+ then if [ $(( 0x$1 )) -ne $(( 0x${last-0} + 1 )) ] # NOTE(review): as written, any gap in code points closes the range, while the file header says gaps between same-category characters are folded in - confirm which is intended
+ then range
+ fi
+ fi
+
+ if [ -z "$first" ] # no open range: start one at this code and remember its category
+ then first=$1
+ val=$2
+ fi
+
+ last=$1
+}
+range () { # print the open range as {CAT_<category>, first code, last - first} and mark it closed
+# echo " {0x$first, 0x$last, CAT_$val},"
+# echo " {0x$first, $((0x$last - 0x$first)), CAT_$val},"
+# echo " {0x$first | (CAT_$val << 24), $((0x$last - 0x$first))},"
+ echo " {CAT_$val, 0x$first, $((0x$last - 0x$first))},"
+ first= # empty again, so the next item() call opens a new range
+}
+/EOS
+
+cat "$UnicodeData" | # feed the data through exactly one of the three folding branches below
+if $foldall
+then sed -e "s,;L[lu];,;LC;," -e "s,;C[fs];,;Cfs;," \
+ -e "s,;L[mo];,;Lmo;," -e "s,;Nl;,;Lmo;," \
+ -e "s,;P.;,;P;," -e "s,;No;,;P;," \
+ -e "s,;S.;,;S;," -e "s,;Z[lp];,;Zlp;," \
+ -e "s,;C[no];,;X;," -e "s,;M[cen];,;M;,"
+elif $foldcase
+then
+# fold Lu/Ll to LC only if lower/upper conversion is available
+ sed -e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;][^;]*\);.*/ s/;Lu;/;LC;/' \
+ -e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;][^;]*\);\([^;]*\);.*/ s/;Ll;/;LC;/' \
+ -e '/;Co;/ d'
+else cat
+fi |
+sed -e "s,^\([^;]*\);[^;]*;\([^;]*\);.*,\1 \2," | # keep only "CODE CATEGORY" (fields 1 and 3) of each line
+uniq -f1 --group=append | sed -e "s,^$,range," -e t -e "s,^,item ," # group adjacent lines with the same category; blank group terminators become "range", other lines "item CODE CATEGORY"
+) | sh > categories.t # the heredoc definitions plus the generated item/range calls are run by sh; its output is the table
+
+sed -e "s/.*\(CAT_[A-Za-z]*\).*/ \1,/" categories.t | # reduce each table line to its CAT_ name followed by a comma
+sort | uniq > categories.cat # one line per distinct CAT_ name
+
diff --git a/newlib/libc/ctype/mkunidata b/newlib/libc/ctype/mkunidata
new file mode 100755
index 0000000..ea18e67
--- /dev/null
+++ b/newlib/libc/ctype/mkunidata
@@ -0,0 +1,40 @@
+#! /bin/sh
+
+echo generating Unicode character properties data for newlib/libc/ctype
+
+cd "$(dirname "$0")" || exit 9 # quoted so a path with blanks works; stop rather than generate files in the wrong directory
+
+#############################################################################
+# checks and (with option -u) download
+
+case "$1" in
+-u)
+ #WGET="wget -N -t 1 --timeout=55"
+ WGET="curl -R -O --connect-timeout 55" # must be quoted: unquoted, the shell ran "-R" as a command and WGET stayed unset
+ # -z FILE (below) makes curl fetch only if the remote copy is newer than the local FILE; presumably the intent of the former "-z $@"
+
+ echo downloading data from unicode.org
+ for data in UnicodeData.txt
+ do $WGET -z "$data" "http://unicode.org/Public/UNIDATA/$data" # $WGET is left unquoted on purpose so it splits into command and options
+ done
+ ;;
+*) echo checking package unicode-ucd
+ grep unicode-ucd /etc/setup/installed.db || exit 9 # without -u, the data must come from the installed package
+ ;;
+esac
+
+for data in UnicodeData.txt
+do test -r $data || ln -s /usr/share/unicode/ucd/$data . || exit 9 # no readable local copy: symlink the system file here, or exit with status 9 if that fails
+done
+
+#############################################################################
+# table generation
+
+echo generating character category table for "isw*.c"
+ sh ./mkcategories # writes categories.t and categories.cat; run through sh, so the script's own #! line is not used
+
+echo generating case conversion table for "tow*.c"
+ sh ./mkcaseconv # writes caseconv.t; run through sh, so the "-f" on its #! line does not apply
+
+#############################################################################
+# end
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2018-03-12 10:40 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-12 10:40 [newlib-cygwin] character data generation Corinna Vinschen
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).