From 9c5d6b1adcf949269e3fceeaf31203921745d2c9 Mon Sep 17 00:00:00 2001 From: mintty Date: Mon, 14 Aug 2017 21:59:25 +0200 Subject: [PATCH 1/4] creation of width data, supporting Unicode updates --- newlib/libc/string/Makefile.widthdata | 47 +++ newlib/libc/string/mkwide | 49 +++ newlib/libc/string/mkwidthA | 20 + newlib/libc/string/uniset | 678 ++++++++++++++++++++++++++++++++++ 4 files changed, 794 insertions(+) create mode 100644 newlib/libc/string/Makefile.widthdata create mode 100755 newlib/libc/string/mkwide create mode 100755 newlib/libc/string/mkwidthA create mode 100755 newlib/libc/string/uniset diff --git a/newlib/libc/string/Makefile.widthdata b/newlib/libc/string/Makefile.widthdata new file mode 100644 index 0000000..14adab5 --- /dev/null +++ b/newlib/libc/string/Makefile.widthdata @@ -0,0 +1,47 @@ +############################################################################# +# generate Unicode width data for newlib/libc/string/wcwidth.c + + +############################################################################# +# table sets to be generated + +widthdata=combining.t ambiguous.t wide.t + +widthdata: $(widthdata) + + +############################################################################# +# tools and data + +#WGET=wget -N -t 1 --timeout=55 +WGET=curl -R -O --connect-timeout 55 +WGET+=-z $@ + +%.txt: + ln -s /usr/share/unicode/ucd/$@ . || $(WGET) http://unicode.org/Public/UNIDATA/$@ + +uniset.tar.gz: + $(WGET) http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz + +uniset: uniset.tar.gz + gzip -dc uniset.tar.gz | tar xvf - uniset + + +############################################################################# +# width data for libc/string/wcwidth.c + +combining.t: uniset UnicodeData.txt Blocks.txt + PATH="${PATH}:." uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B +D7B0-D7C6 +D7CB-D7FB c > combining.t + +WIDTH-A: uniset UnicodeData.txt Blocks.txt EastAsianWidth.txt + PATH="${PATH}:." 
sh ./mkwidthA + +ambiguous.t: uniset WIDTH-A UnicodeData.txt Blocks.txt + PATH="${PATH}:." uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c > ambiguous.t + +wide.t: uniset UnicodeData.txt Blocks.txt EastAsianWidth.txt + PATH="${PATH}:." sh ./mkwide + + +############################################################################# +# end diff --git a/newlib/libc/string/mkwide b/newlib/libc/string/mkwide new file mode 100755 index 0000000..55a0bab --- /dev/null +++ b/newlib/libc/string/mkwide @@ -0,0 +1,49 @@ +#! /bin/sh + +# generate list of wide characters, with convex closure + +skipcheck=false + +if [ ! -r EastAsianWidth.txt ] +then ln -s /usr/share/unicode/ucd/EastAsianWidth.txt . || exit 1 +fi +if [ ! -r UnicodeData.txt ] +then ln -s /usr/share/unicode/ucd/UnicodeData.txt . || exit 1 +fi +if [ ! -r Blocks.txt ] +then ln -s /usr/share/unicode/ucd/Blocks.txt . || exit 1 +fi + +sed -e "s,^\([^;]*\);[NAH],\1," -e t -e d EastAsianWidth.txt > wide.na +sed -e "s,^\([^;]*\);[WF],\1," -e t -e d EastAsianWidth.txt > wide.fw + +PATH="$PATH:." # for uniset + +nrfw=`uniset +wide.fw nr | sed -e 's,.*:,,'` +echo FW $nrfw +nrna=`uniset +wide.na nr | sed -e 's,.*:,,'` +echo NAH $nrna + +extrablocks="2E80-303E" + +# check all blocks +includes () { + nr=`uniset +wide.$2 -$1 nr | sed -e 's,.*:,,'` + test $nr != $3 +} +echo "adding compact closure of wide ranges, this may take ~10min" +for b in $extrablocks `sed -e 's,^\([0-9A-F]*\)\.\.\([0-9A-F]*\).*,\1-\2,' -e t -e d Blocks.txt` +do range=$b + echo checking $range $* >&2 + if includes $range fw $nrfw && ! includes $range na $nrna + then echo $range + fi +done > wide.blocks + +( +sed -e "s,^,//," -e 1q EastAsianWidth.txt +sed -e "s,^,//," -e 1q Blocks.txt +uniset `sed -e 's,^,+,' wide.blocks` +wide.fw c +) > wide.t + +rm -f wide.na wide.fw wide.blocks diff --git a/newlib/libc/string/mkwidthA b/newlib/libc/string/mkwidthA new file mode 100755 index 0000000..343ab40 --- /dev/null +++ b/newlib/libc/string/mkwidthA @@ -0,0 +1,20 @@ +#! 
/bin/sh + +# generate WIDTH-A file, listing Unicode characters with width property +# Ambiguous, from EastAsianWidth.txt + +if [ ! -r EastAsianWidth.txt ] +then ln -s /usr/share/unicode/ucd/EastAsianWidth.txt . || exit 1 +fi +if [ ! -r UnicodeData.txt ] +then ln -s /usr/share/unicode/ucd/UnicodeData.txt . || exit 1 +fi +if [ ! -r Blocks.txt ] +then ln -s /usr/share/unicode/ucd/Blocks.txt . || exit 1 +fi + +sed -e "s,^\([^;]*\);A,\1," -e t -e d EastAsianWidth.txt > width-a-new +rm -f WIDTH-A +echo "# UAX #11: East Asian Ambiguous" > WIDTH-A +PATH="$PATH:." uniset +width-a-new compact >> WIDTH-A +rm -f width-a-new diff --git a/newlib/libc/string/uniset b/newlib/libc/string/uniset new file mode 100755 index 0000000..415e219 --- /dev/null +++ b/newlib/libc/string/uniset @@ -0,0 +1,678 @@ +#!/usr/bin/perl +# Uniset -- Unicode subset manager -- Markus Kuhn +# http://www.cl.cam.ac.uk/~mgk25/download/uniset.tar.gz +# $Id: uniset,v 1.18 2004-04-10 21:19:39+01 mgk25 Exp mgk25 $ + +require 5.008; +use open ':utf8'; + +binmode(STDOUT, ":utf8"); +binmode(STDIN, ":utf8"); + +my (%name, %invname, %category, %comment); + +print <. + +yyyy yyyy (optionally prefixed with 0x) is a Unicode character + belonging to the specified subset. + +yyyy-yyyy a range of Unicode characters belonging to +yyyy..yyyy the specified subset. + +xx yy yy yy-yy yy xx denotes a row (high-byte) and the yy specify + corresponding low bytes or with a hyphen also ranges of + low bytes in the Unicode values that belong to this + subset. This is also the format that is generated by + the compact command. +End +exit 1 if $#ARGV < 0; + + +# Subroutine to identify whether the ISO 10646/Unicode character code +# ucs belongs into the East Asian Wide (W) or East Asian FullWidth +# (F) category as defined in Unicode Technical Report #11. 
+ +sub iswide ($) { + my $ucs = shift(@_); + + return ($ucs >= 0x1100 && + ($ucs <= 0x115f || # Hangul Jamo + $ucs == 0x2329 || $ucs == 0x232a || + ($ucs >= 0x2e80 && $ucs <= 0xa4cf && + $ucs != 0x303f) || # CJK .. Yi + ($ucs >= 0xac00 && $ucs <= 0xd7a3) || # Hangul Syllables + ($ucs >= 0xf900 && $ucs <= 0xfaff) || # CJK Comp. Ideographs + ($ucs >= 0xfe30 && $ucs <= 0xfe6f) || # CJK Comp. Forms + ($ucs >= 0xff00 && $ucs <= 0xff60) || # Fullwidth Forms + ($ucs >= 0xffe0 && $ucs <= 0xffe6) || + ($ucs >= 0x20000 && $ucs <= 0x2fffd) || + ($ucs >= 0x30000 && $ucs <= 0x3fffd))); +} + +# Return the Unicode name that belongs to a given character code + +# Jamo short names, see Unicode 3.0, table 4-4, page 86 + +my @lname = ('G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', + 'J', 'JJ', 'C', 'K', 'T', 'P', 'H'); # 1100..1112 +my @vname = ('A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', + 'WA', 'WAE', 'OE', 'YO', 'U', 'WEO', 'WE', 'WI', 'YU', + 'EU', 'YI', 'I'); # 1161..1175 +my @tname = ('G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', + 'LB', 'LS', 'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', + 'NG', 'J', 'C', 'K', 'T', 'P', 'H'); # 11a8..11c2 + +sub name { + my $ucs = shift(@_); + + # The intervals used here reflect Unicode Version 3.2 + if (($ucs >= 0x3400 && $ucs <= 0x4db5) || + ($ucs >= 0x4e00 && $ucs <= 0x9fa5) || + ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) { + return "CJK UNIFIED IDEOGRAPH-" . sprintf("%04X", $ucs); + } + + if ($ucs >= 0xac00 && $ucs <= 0xd7a3) { + my $s = $ucs - 0xac00; + my $l = 0x1100 + int($s / (21 * 28)); + my $v = 0x1161 + int(($s % (21 * 28)) / 28); + my $t = 0x11a7 + $s % 28; + return "HANGUL SYLLABLE " . + ($lname[int($s / (21 * 28))] . + $vname[int(($s % (21 * 28)) / 28)] . 
+ $tname[$s % 28 - 1]); + } + + return $name{$ucs}; +} + +sub is_unicode { + my $ucs = shift(@_); + + # The intervals used here reflect Unicode Version 3.2 + if (($ucs >= 0x3400 && $ucs <= 0x4db5) || + ($ucs >= 0x4e00 && $ucs <= 0x9fa5) || + ($ucs >= 0xac00 && $ucs <= 0xd7a3) || + ($ucs >= 0x20000 && $ucs <= 0x2a6d6)) { + return 1; + } + + return exists $name{$ucs}; +} + + +my $html = 0; +my $image = 0; +my $adducs = 0; +my $unicodedata = "UnicodeData.txt"; +my $blockdata = "Blocks.txt"; +my $datadir = "$ENV{HOME}/local/lib/ucs"; + +# read list of all Unicode names +if (!open(UDATA, $unicodedata) && !open(UDATA, "$datadir/$unicodedata")) { + die ("Can't open Unicode database '$unicodedata':\n$!\n\n" . + "Please make sure that you have downloaded the file\n" . + "ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt\n"); +} +while (<UDATA>) { + if (/^([0-9,A-F]{4,8});([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*);([^;]*)$/) { + next if $2 ne '' && substr($2, 0, 1) eq '<'; + $ucs = hex($1); + $name{$ucs} = $2; + $invname{$2} = $ucs; + $category{$ucs} = $3; + $comment{$ucs} = $12; + } else { + die("Syntax error in line '$_' in file '$unicodedata'"); + } +} +close(UDATA); + +# read list of all Unicode blocks +if (!open(UDATA, $blockdata) && !open(UDATA, "$datadir/$blockdata")) { + die ("Can't open Unicode blockname list '$blockdata':\n$!\n\n" . + "Please make sure that you have downloaded the file\n" .
+ "ftp://ftp.unicode.org/Public/UNIDATA/Blocks.txt\n"); +} +my $blocks = 0; +my (@blockstart, @blockend, @blockname); +while (<UDATA>) { + if (/^\s*([0-9,A-F]{4,8})\s*\.\.\s*([0-9,A-F]{4,8})\s*;\s*(.*)$/) { + $blockstart[$blocks] = hex($1); + $blockend [$blocks] = hex($2); + $blockname [$blocks] = $3; + $blocks++; + } elsif (/^\s*\#/ || /^\s*$/) { + # ignore comments and empty lines + } else { + die("Syntax error in line '$_' in file '$blockdata'"); + } +} +close(UDATA); +if ($blockend[$blocks-1] < 0x110000) { + $blockstart[$blocks] = 0x110000; + $blockend [$blocks] = 0x7FFFFFFF; + $blockname [$blocks] = "Beyond Plane 16"; + $blocks++; +} + +# process command line arguments +while ($_ = shift(@ARGV)) { + if (/^html$/) { + $html = 1; + } elsif (/^ucs$/) { + $adducs = 1; + } elsif (/^img$/) { + $html = 1; + $image = 1; + } elsif (/^template$/) { + $template = shift(@ARGV); + open(TEMPLATE, $template) || die("Can't open template file '$template': '$!'"); + while (