Use UnicodeData.txt to create UTF-8 ctype map.

This should provide more complete coverage of currently defined Unicode
characters than the manually assembled map we use now.
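
As an illustration only (a standalone sketch, not part of this commit or
of the script itself): each UnicodeData.txt record is split on ';', the
general category in field 2 selects the ctype class, and the code point
is converted to the UTF-8 byte string that keys the UTF-8.cm charmap,
mirroring the parse_unidata()/wctomb() logic in the utf8-rollup.pl diff
below.

#!/usr/bin/perl
# Standalone sketch (not part of this commit): classify one
# UnicodeData.txt record the way the new utf8-rollup.pl does.
use strict;
use warnings;

# U+0041 as it appears in UnicodeData.txt; field 2 is the general
# category, field 13 is the simple lowercase mapping.
my $record = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;";
my @d = split(/;/, $record, -1);

# General category -> ctype class, same precedence as the script.
my $cat =
    $d[2] =~ /^Lu/ ? "upper" :
    $d[2] =~ /^Ll/ ? "lower" :
    $d[2] =~ /^Nd/ ? "digit" :
    $d[2] =~ /^L/  ? "alpha" :
    $d[2] =~ /^P/  ? "punct" :
    $d[2] =~ /^[MNS]/ ? "graph" :
    $d[2] =~ /^C/  ? "cntrl" :
    $d[2] =~ /^Z/  ? "space" : "unassigned";

# Code point -> UTF-8 hex byte string, the same conversion the script's
# wctomb() performs; the byte string is what keys the UTF-8.cm charmap.
sub wctomb {
	my $wc = hex(shift);
	return sprintf("%02X", $wc) if ($wc & ~0x7f) == 0;
	my ($lead, $len) = ($wc & ~0x7ff) == 0  ? (0xc0, 2)
			 : ($wc & ~0xffff) == 0 ? (0xe0, 3)
			 :			  (0xf0, 4);
	my $ret = "";
	for (my $i = $len - 1; $i > 0; $i--) {
		$ret = sprintf("%02X", ($wc & 0x3f) | 0x80) . $ret;
		$wc >>= 6;
	}
	return sprintf("%02X", ($wc & 0xff) | $lead) . $ret;
}

# Prints: 0041 -> upper, UTF-8 41, tolower 0061 (UTF-8 61)
print "$d[0] -> $cat, UTF-8 ", wctomb($d[0]),
    ", tolower $d[13] (UTF-8 ", wctomb($d[13]), ")\n";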

Comparison of original and new UTF-8 ctype maps by character class:

TYPE    ORIG    NEW
alnum   94229   126029
alpha   93557   125419
blank   4       2
cntrl   73      137685
digit   469     622
graph   109615  137203
lower   1478    2145
print   109641  137222
punct   3428    797
rune    110481  274907
space   33      24
upper   983     1781
xdigit  469     622

The large number of added cntrl definitions comes from the private-use
planes currently being defined as cntrl; this may change in the future.
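
For illustration, a minimal sketch (mine, not from the commit) of why
that happens: UnicodeData.txt describes private-use ranges as First/Last
record pairs of general category Co, and the script's ^C branch maps
every C* category to cntrl; the supplementary private-use planes 15 and
16 follow the same pattern.

#!/usr/bin/perl
# Sketch only: why private-use code points end up in cntrl.
use strict;
use warnings;

# The BMP Private Use Area appears in UnicodeData.txt as a First/Last
# pair of general category Co (fields shown approximately):
my @pua = (
	"E000;<Private Use, First>;Co;0;L;;;;;N;;;;;",
	"F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;",
);

foreach my $l (@pua) {
	my @d = split(/;/, $l, -1);
	# Co is not matched by any earlier branch, so the ^C catch-all
	# (covering Cc, Cf, Co, Cs) classifies it as cntrl; the
	# First/Last names delimit the whole U+E000..U+F8FF range.
	my $cat = $d[2] =~ /^C/ ? "cntrl" : "other";
	print "$d[0] $d[1] -> $cat\n";
}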

Discussed with:	bapt
Approved by:	kib (mentor, implicit)
MFC after:	1 month
Differential Revision:	https://reviews.freebsd.org/D17842
Committed by yuripv on 2018-11-17 10:36:00 +00:00
parent 82a106e39f
commit c6e4d24106
5 changed files with 34218 additions and 13092 deletions

File diff suppressed because it is too large


@@ -47,7 +47,6 @@ COLLATIONS_SPECIAL_ENV+= ${area}.${enc}
 PASSON+= COLLATIONS_SPECIAL="${COLLATIONS_SPECIAL_ENV}"
 all:
-	cp ${ETCDIR}/common.UTF-8.src ${UNIDIR}/posix/xx_Comm_C.UTF-8.src
 .for t in ${TYPES}
 . if ${KNOWN:M${t}}
 	test -d ${t} || mkdir ${t}
@@ -88,7 +87,7 @@ build-${t}: gen-${t}
 	env ${PASSON} tools/finalize ${t}
 .endfor
-gen-ctypedef: transfer-rollup
+gen-ctypedef: ctype-rollup
 static-colldef: gen-colldef
 build-colldef: static-colldef
@@ -98,13 +97,8 @@ static-colldef:
 		colldef.draft/${area}.${enc}.src
 .endfor
-transfer-rollup:
-	cp ${ETCDIR}/common.UTF-8.src ${UNIDIR}/posix/xx_Comm_C.UTF-8.src
-rollup:
-	perl -I tools tools/utf8-rollup.pl \
-		--unidir=$$(realpath ${UNIDIR}) \
-		--etc=$$(realpath ${ETCDIR})
+ctype-rollup:
+	perl -I tools tools/utf8-rollup.pl --unidir=$$(realpath ${UNIDIR})
 clean:
 .for t in ${TYPES}

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -2,126 +2,25 @@
# $FreeBSD$
use strict;
#use File::Copy;
#use XML::Parser;
use Tie::IxHash;
#use Data::Dumper;
use Getopt::Long;
#use Digest::SHA qw(sha1_hex);
#require "charmaps.pm";
if ($#ARGV != 1) {
print "Usage: $0 --unidir=<unidir> --etc=<etcdir>\n";
if ($#ARGV != 0) {
print "Usage: $0 --unidir=<unidir>\n";
exit(1);
}
my $UNIDIR = undef;
my $ETCDIR = undef;
my $result = GetOptions (
"unidir=s" => \$UNIDIR,
"etc=s" => \$ETCDIR,
"unidir=s" => \$UNIDIR
);
my @SECTIONS = (
["en_US", "* 0x0000 - 0x007F Basic Latin\n" .
"* 0x0080 - 0x00FF Latin-1 Supplement\n" .
"* 0x0100 - 0x017F Latin Extended-A\n" .
"* 0x0180 - 0x024F Latin Extended-B\n" .
"* 0x0250 - 0x02AF IPA Extensions\n" .
"* 0x1D00 - 0x1D7F Phonetic Extensions\n" .
"* 0x1D80 - 0x1DBF Phonetic Extensions Supplement\n" .
"* 0x1E00 - 0x1EFF Latin Extended Additional\n" .
"* 0x2150 - 0x218F Number Forms (partial - Roman Numerals)\n".
"* 0x2C60 - 0x2C7F Latin Extended-C\n" .
"* 0xA720 - 0xA7FF Latin Extended-D\n" .
"* 0xAB30 - 0xAB6F Latin Extended-E\n" .
"* 0xFB00 - 0xFF4F Alphabetic Presentation Forms (partial)\n".
"* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
["el_GR", "* 0x0370 - 0x03FF Greek (No Coptic!)\n" .
"* 0x1F00 - 0x1FFF Greek Extended\n"],
["ru_RU", "* 0x0400 - 0x04FF Cyrillic\n" .
"* 0x0500 - 0x052F Cyrillic Supplementary\n" .
"* 0x2DE0 - 0x2DFF Cyrillic Extended-A\n" .
"* 0xA640 - 0xA69F Cyrillic Extended-B\n"],
["hy_AM", "* 0x0530 - 0x058F Armenian\n" .
"* 0xFB00 - 0xFF4F Alphabetic Presentation Forms (partial)\n"],
["he_IL", "* 0x0590 - 0x05FF Hebrew\n" .
"* 0xFB00 - 0xFF4F Alphabetic Presentation Forms (partial)\n"],
["ar_SA", "* 0x0600 - 0x06FF Arabic\n" .
"* 0x0750 - 0x074F Arabic Supplement\n" .
"* 0x08A0 - 0x08FF Arabic Extended-A\n" .
"* 0xFB50 - 0xFDFF Arabic Presentation Forms (partial)\n" .
"* 0xFE70 - 0xFEFF Arabic Presentation Forms-B (partial)\n"],
["hi_IN", "* 0x0900 - 0x097F Devanagari\n" .
"* 0xA8E0 - 0xA8FF Devanagari Extended\n"],
["bn_IN", "* 0x0900 - 0x097F Bengali\n"],
["pa_Guru_IN", "* 0x0A00 - 0x0A7F Gurmukhi\n"],
["gu_IN", "* 0x0A80 - 0x0AFF Gujarati\n"],
["or_IN", "* 0x0B00 - 0x0B7F Oriya\n"],
["ta_IN", "* 0x0B80 - 0x0BFF Tamil\n"],
["te_IN", "* 0x0C00 - 0x0C7F Telugu\n"],
["kn_IN", "* 0x0C80 - 0x0CFF Kannada\n"],
["ml_IN", "* 0x0D00 - 0x0D7F Malayalam\n"],
["si_LK", "* 0x0D80 - 0x0DFF Sinhala\n"],
["th_TH", "* 0x0E00 - 0x0E7F Thai\n"],
["lo_LA", "* 0x0E80 - 0x0EFF Lao\n"],
["bo_IN", "* 0x0F00 - 0x0FFF Tibetan\n"],
["my_MM", "* 0x1000 - 0x109F Myanmar\n" .
"* 0xA9E0 - 0xA9FF Myanmar Extended-B\n" .
"* 0xAA60 - 0xAA7F Myanmar Extended-A\n"],
["ka_GE", "* 0x10A0 - 0x10FF Georgia\n" .
"* 0x2D00 - 0x2D2F Georgian Supplement\n"],
["ja_JP", "* 0x1100 - 0x11FF Hangul Jamo\n" .
"* 0x3000 - 0x30FF CJK Symbols and Punctuation (partial)\n" .
"* 0x3040 - 0x309F Hiragana\n" .
"* 0x30A0 - 0x30FF Katakana\n" .
"* 0x31F0 - 0x31FF Katakana Phonetic Extensions\n" .
"* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" .
"* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" .
"* 0x3300 - 0x33FF CJK Compatibility\n" .
"* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension-A (added)\n" .
"* 0x4E00 - 0x9FCC CJK Unified Ideographs (overridden)\n" .
"* 0xAC00 - 0xA7A3 Hangul Syllables (partial)\n" .
"* 0xD7B0 - 0xD7FF Hangul Jamo Extended-B\n" .
"* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n" .
"* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
["am_ET", "* 0x1200 - 0x137F Ethiopic\n" .
"* 0x1380 - 0x139F Ethiopic Supplement\n" .
"* 0x2D80 - 0x2DDF Ethiopic Extended\n" .
"* 0xAB00 - 0xAB2F Ethiopic Extended-A\n"],
["chr_US", "* 0x13A0 - 0x13FF Cherokee\n"],
["km_KH", "* 0x1780 - 0x17FF Khmer\n" .
"* 0x19E0 - 0x19FF Khmer Symbols\n"],
["shi_Tfng_MA", "* 0x2D30 - 0x2D2F Tifinagh\n"],
["ii_CN", "* 0xA000 - 0xA48F Yi Syllables\n" .
"* 0xA490 - 0xA4CF Yi Radicals\n"],
["vai_Vaii_LR", "* 0xA500 - 0xA63F Vai\n"],
["ko_KR", "* 0x3130 - 0x318F Hangul Compatibility Jamo (partial)\n" .
"* 0xA960 - 0xA97F Hangul Jamo Extended-A\n" .
"* 0xAC00 - 0xA7A3 Hangul Syllables (partial)\n" .
"* 0xFF00 - 0xFFEF Halfwidth and Fullwidth Forms (partial)\n"],
);
# ["zh_Hans_CN", "* 0x2E80 - 0x2EFF CJK Radicals Supplement\n" .
# "* 0x2F00 - 0x2FDF Rangxi Radicales\n" .
# "* 0x3000 - 0x30FF CJK Symbols and Punctuation (partial)\n" .
# "* 0x3200 - 0x32FF Enclosed CJK Letters and Months (partial)\n" .
# "* 0x3400 - 0x4DB5 CJK Unified Ideographs Extension A\n" .
# "* 0xF900 - 0xFAFF CJK Compatibility Ideographs (partial)\n"],
my %seen = ();
my %pending_seen = ();
my %utf8map = ();
my %utf8aliases = ();
my $outfilename = "$ETCDIR/common.UTF-8.src";
my $manual_file = "$ETCDIR/manual-input.UTF-8";
my $stars = "**********************************************************************\n";
my $outfilename = "$UNIDIR/posix/xx_Comm_C.UTF-8.src";
get_utf8map("$UNIDIR/posix/UTF-8.cm");
generate_header ();
generate_sections ();
parse_unidata ("$UNIDIR/UnicodeData.txt");
generate_footer ();
############################
@@ -134,8 +33,6 @@ sub get_utf8map {
 	close(FIN);
 	chomp(@lines);
-	my $prev_k = undef;
-	my $prev_v = "";
 	my $incharmap = 0;
 	foreach my $l (@lines) {
 		$l =~ s/\r//;
@@ -150,17 +47,11 @@ sub get_utf8map {
 		next if (!$incharmap);
 		last if ($l eq "END CHARMAP");
-		$l =~ /^<([^\s]+)>\s+(.*)/;
-		my $k = $1;
-		my $v = $2;
-		$k =~ s/_/ /g; # unicode char string
-		$v =~ s/\\x//g; # UTF-8 char code
+		$l =~ /^(<[^\s]+>)\s+(.*)/;
+		my $k = $2;
+		my $v = $1;
+		$k =~ s/\\x//g; # UTF-8 char code
 		$utf8map{$k} = $v;
-		$utf8aliases{$k} = $prev_k if ($prev_v eq $v);
-		$prev_v = $v;
-		$prev_k = $k;
 	}
 }
@@ -185,190 +76,117 @@ sub generate_footer {
close (FOUT);
}
sub already_seen {
my $ucode = shift;
if (defined $seen{$ucode}) {
return 1;
sub wctomb {
my $wc = hex(shift);
my $lead;
my $len;
my $ret = "";
my $i;
if (($wc & ~0x7f) == 0) {
return sprintf "%02X", $wc;
} elsif (($wc & ~0x7ff) == 0) {
$lead = 0xc0;
$len = 2;
} elsif (($wc & ~0xffff) == 0) {
$lead = 0xe0;
$len = 3;
} elsif ($wc >= 0 && $wc <= 0x10ffff) {
$lead = 0xf0;
$len = 4;
}
$pending_seen{$ucode} = 1;
return 0;
for ($i = $len - 1; $i > 0; $i--) {
$ret = (sprintf "%02X", ($wc & 0x3f) | 0x80) . $ret;
$wc >>= 6;
}
$ret = (sprintf "%02X", ($wc & 0xff) | $lead) . $ret;
return $ret;
}
sub already_seen_RO {
my $ucode = shift;
if (defined $seen{$ucode}) {
return 1;
}
return 0;
}
sub parse_unidata {
my $file = shift;
my %data = ();
sub merge_seen {
foreach my $sn (keys %pending_seen) {
$seen{$sn} = 1;
}
%pending_seen = ();
}
sub initialize_lines {
my @result = ();
my $terr = shift;
my $n;
my $back2hex;
my @types = ("graph", "alpha");
if ($terr eq "ja_JP") {
foreach my $T (@types) {
push @result, "$T\t<CJK_UNIFIED_IDEOGRAPH-3400>;/\n";
for ($n = hex("3401"); $n <= hex("4DB4"); $n++) {
$back2hex=sprintf("%X", $n);
push @result, "\t<CJK_UNIFIED_IDEOGRAPH-" .
$back2hex . ">;/\n";
}
push @result, "\t<CJK_UNIFIED_IDEOGRAPH-4DB5>\n";
push @result, "$T\t<CJK_UNIFIED_IDEOGRAPH-4E00>;/\n";
for ($n = hex("4E01"); $n <= hex("9FCB"); $n++) {
$back2hex=sprintf("%X", $n);
push @result, "\t<CJK_UNIFIED_IDEOGRAPH-" .
$back2hex . ">;/\n";
}
push @result, "\t<CJK_UNIFIED_IDEOGRAPH-9FCC>\n";
}
push @result, "merge\tnow\n";
}
return @result;
}
sub compress_ctype {
my $territory = shift;
my $term;
my $active = 0;
my $cat_loaded = 0;
my $lock_ID;
my $prev_ID;
my $curr_ID;
my $lock_name;
my $prev_name;
my $curr_name;
my $key_name;
my $category = '';
my @lines = initialize_lines ($territory);
my $filename = "$UNIDIR/posix/$territory.UTF-8.src";
if (! -f $filename) {
print STDERR "Cannot open $filename\n";
return;
}
open(FIN, "$filename");
print "Reading from $filename\n";
while (<FIN>) {
if (/^LC_CTYPE/../^END LC_CTYPE/) {
if ($_ ne "LC_CTYPE\n" && $_ ne "END LC_CTYPE\n" &&
$_ ne "*************\n" && $_ ne "\n") {
push @lines, $_;
}
}
}
open(FIN, $file);
my @lines = <FIN>;
close(FIN);
foreach my $line (@lines) {
if ($line =~ m/^([a-z]{3,})\t/) {
$category = $1;
if ($category eq 'merge') {
merge_seen;
next;
}
if ($category ne 'print') {
$cat_loaded = 1;
}
chomp(@lines);
foreach my $l (@lines) {
my @d = split(/;/, $l, -1);
my $mb = wctomb($d[0]);
my $cat;
# XXX There are code points present in UnicodeData.txt
# and missing from UTF-8.cm
next if !defined $utf8map{$mb};
# Define the category
if ($d[2] =~ /^Lu/) {
$cat = "upper";
} elsif ($d[2] =~ /^Ll/) {
$cat = "lower";
} elsif ($d[2] =~ /^Nd/) {
$cat = "digit";
} elsif ($d[2] =~ /^L/) {
$cat = "alpha";
} elsif ($d[2] =~ /^P/) {
$cat = "punct";
} elsif ($d[2] =~ /^M/ || $d[2] =~ /^N/ || $d[2] =~ /^S/) {
$cat = "graph";
} elsif ($d[2] =~ /^C/) {
$cat = "cntrl";
} elsif ($d[2] =~ /^Z/) {
$cat = "space";
}
next if ($category eq 'print');
if ($category eq 'toupper' || $category eq 'tolower') {
if ($line =~ m/<([-_A-Za-z0-9]+)>,/) {
$key_name = $1;
$key_name =~ s/_/ /g;
if (already_seen_RO (hex($utf8map{$key_name}))) {
next;
}
if ($cat_loaded) { print FOUT $category; }
$cat_loaded = 0;
$line =~ s/^[a-z]{3,}\t/\t/;
print FOUT $line;
}
next;
$data{$cat}{$mb}{'wc'} = $d[0];
# Check if it's a start or end of range
if ($d[1] =~ /First>$/) {
$data{$cat}{$mb}{'start'} = 1;
} elsif ($d[1] =~ /Last>$/) {
$data{$cat}{$mb}{'end'} = 1;
}
if ($line =~ m/<([-_A-Za-z0-9]+)>(;.|)$/) {
$term = ($2 eq '') ? 1 : 0;
$curr_name = $1;
$key_name = $1;
$key_name =~ s/_/ /g;
$curr_ID = hex($utf8map{$key_name});
if (already_seen ($curr_ID)) {
next;
}
if ($active) {
if ($curr_ID == $prev_ID + 1) {
$prev_ID = $curr_ID;
$prev_name = $curr_name;
} else {
if ($cat_loaded) { print FOUT $category; }
$cat_loaded = 0;
if ($prev_ID == $lock_ID) {
print FOUT "\t<" . $prev_name . ">;/\n";
} elsif ($prev_ID - 1 == $lock_ID) {
print FOUT "\t<" . $lock_name . ">;/\n";
print FOUT "\t<" . $prev_name . ">;/\n";
} else {
print FOUT "\t<" . $lock_name .
">;...;<" . $prev_name . ">;/\n";
}
$lock_ID = $curr_ID;
$prev_ID = $curr_ID;
$lock_name = $curr_name;
$prev_name = $curr_name;
}
} else {
$active = 1;
$lock_ID = $curr_ID;
$prev_ID = $curr_ID;
$lock_name = $curr_name;
$prev_name = $curr_name;
}
if ($term) {
if ($cat_loaded) { print FOUT $category; }
$cat_loaded = 0;
if ($curr_ID == $lock_ID) {
print FOUT "\t<" . $curr_name . ">\n";
} elsif ($curr_ID == $lock_ID + 1) {
print FOUT "\t<" . $lock_name . ">;/\n";
print FOUT "\t<" . $curr_name . ">\n";
} else {
print FOUT "\t<" . $lock_name .
">;...;<" . $curr_name . ">\n";
}
$active = 0;
}
# Check if there's upper/lower mapping
if ($d[12] ne "") {
$data{'toupper'}{$mb} = wctomb($d[12]);
} elsif ($d[13] ne "") {
$data{'tolower'}{$mb} = wctomb($d[13]);
}
}
my $first;
my $inrange = 0;
# Now write out the categories
foreach my $cat (sort keys (%data)) {
print FOUT "$cat\t";
$first = 1;
foreach my $mb (sort keys (%{$data{$cat}})) {
if ($first == 1) {
$first = 0;
} elsif ($inrange == 1) {
# Safety belt
die "broken range end wc=$data{$cat}{$mb}{'wc'}"
if !defined $data{$cat}{$mb}{'end'};
print FOUT ";...;";
$inrange = 0;
} else {
print FOUT $line;
print FOUT ";/\n\t";
}
if ($cat eq "tolower" || $cat eq "toupper") {
print FOUT "($utf8map{$mb},$utf8map{$data{$cat}{$mb}})";
} else {
if (defined($data{$cat}{$mb}{'start'})) {
$inrange = 1;
}
print FOUT "$utf8map{$mb}";
}
}
}
sub generate_sections {
foreach my $section (@SECTIONS ) {
print FOUT "\n";
print FOUT $stars;
print FOUT @$section[1];
print FOUT $stars;
compress_ctype (@$section[0]);
merge_seen;
}
my @lines = ();
open(FIN, "$manual_file");
print "Reading from $manual_file\n";
while (<FIN>) {
push @lines, $_;
}
close(FIN);
foreach my $line (@lines) {
print FOUT $line;
}
}