freebsd-dev/contrib/perl5/lib/unicode/mktables.PL
2000-06-25 11:04:01 +00:00

340 lines
8.9 KiB
Perl
Executable File

#!../../miniperl
$UnicodeData = "Unicode.300";
# Note: we try to keep filenames unique within first 8 chars. Using
# subdirectories for the following helps.
mkdir "In", 0777;
mkdir "Is", 0777;
mkdir "To", 0777;
@todo = (
# typical
['IsWord', '$cat =~ /^L[ulo]|^Nd/ or $code eq "005F"', ''],
['IsAlnum', '$cat =~ /^L[ulo]|^Nd/', ''],
['IsAlpha', '$cat =~ /^L[ulo]/', ''],
['IsSpace', '$cat =~ /^Z/ or $code lt "0020" and chr(hex $code) =~ /^\s/', ''],
['IsDigit', '$cat =~ /^Nd$/', ''],
['IsUpper', '$cat =~ /^Lu$/', ''],
['IsLower', '$cat =~ /^Ll$/', ''],
['IsASCII', 'hex $code <= 127', ''],
['IsCntrl', '$cat =~ /^C/', ''],
['IsGraph', '$cat =~ /^[^C]/ and $code ne "0020"', ''],
['IsPrint', '$cat =~ /^[^C]/', ''],
['IsPunct', '$cat =~ /^P/', ''],
['IsXDigit', '$code =~ /^00(3[0-9]|[46][1-6])$/', ''],
['ToUpper', '$up', '$up'],
['ToLower', '$down', '$down'],
['ToTitle', '$title', '$title'],
['ToDigit', '$dec ne ""', '$dec'],
# Name
['Name', '$name', '$name'],
# Category
['Category', '$cat', '$cat'],
# Normative
['IsM', '$cat =~ /^M/', ''], # Mark
['IsMn', '$cat eq "Mn"', ''], # Mark, Non-Spacing
['IsMc', '$cat eq "Mc"', ''], # Mark, Combining
['IsN', '$cat =~ /^N/', ''], # Number
['IsNd', '$cat eq "Nd"', ''], # Number, Decimal Digit
['IsNo', '$cat eq "No"', ''], # Number, Other
['IsZ', '$cat =~ /^Z/', ''], # Zeparator
['IsZs', '$cat eq "Zs"', ''], # Separator, Space
['IsZl', '$cat eq "Zl"', ''], # Separator, Line
['IsZp', '$cat eq "Zp"', ''], # Separator, Paragraph
['IsC', '$cat =~ /^C/', ''], # Crazy
['IsCc', '$cat eq "Cc"', ''], # Other, Control or Format
['IsCo', '$cat eq "Co"', ''], # Other, Private Use
['IsCn', '$cat eq "Cn"', ''], # Other, Not Assigned
# Informative
['IsL', '$cat =~ /^L/', ''], # Letter
['IsLu', '$cat eq "Lu"', ''], # Letter, Uppercase
['IsLl', '$cat eq "Ll"', ''], # Letter, Lowercase
['IsLt', '$cat eq "Lt"', ''], # Letter, Titlecase
['IsLm', '$cat eq "Lm"', ''], # Letter, Modifier
['IsLo', '$cat eq "Lo"', ''], # Letter, Other
['IsP', '$cat =~ /^P/', ''], # Punctuation
['IsPd', '$cat eq "Pd"', ''], # Punctuation, Dash
['IsPs', '$cat eq "Ps"', ''], # Punctuation, Open
['IsPe', '$cat eq "Pe"', ''], # Punctuation, Close
['IsPo', '$cat eq "Po"', ''], # Punctuation, Other
['IsS', '$cat =~ /^S/', ''], # Symbol
['IsSm', '$cat eq "Sm"', ''], # Symbol, Math
['IsSc', '$cat eq "Sc"', ''], # Symbol, Currency
['IsSo', '$cat eq "So"', ''], # Symbol, Other
# Combining class
['CombiningClass', '$comb', '$comb'],
# BIDIRECTIONAL PROPERTIES
['Bidirectional', '$bid', '$bid'],
# Strong types:
['IsBidiL', '$bid eq "L"', ''], # Left-Right; Most alphabetic,
# syllabic, and logographic
# characters (e.g., CJK
# ideographs)
['IsBidiR', '$bid eq "R"', ''], # Right-Left; Arabic, Hebrew,
# and punctuation specific to
# those scripts
# Weak types:
['IsBidiEN','$bid eq "EN"', ''], # European Number
['IsBidiES','$bid eq "ES"', ''], # European Number Separator
['IsBidiET','$bid eq "ET"', ''], # European Number Terminator
['IsBidiAN','$bid eq "AN"', ''], # Arabic Number
['IsBidiCS','$bid eq "CS"', ''], # Common Number Separator
# Separators:
['IsBidiB', '$bid eq "B"', ''], # Block Separator
['IsBidiS', '$bid eq "S"', ''], # Segment Separator
# Neutrals:
['IsBidiWS','$bid eq "WS"', ''], # Whitespace
['IsBidiON','$bid eq "ON"', ''], # Other Neutrals ; All other
# characters: punctuation,
# symbols
# Decomposition
['Decomposition', '$decomp', '$decomp'],
['IsDecoCanon', '$decomp && $decomp !~ /^</', ''],
['IsDecoCompat', '$decomp =~ /^</', ''],
['IsDCfont', '$decomp =~ /^<font>/', ''],
['IsDCnoBreak', '$decomp =~ /^<noBreak>/', ''],
['IsDCinitial', '$decomp =~ /^<initial>/', ''],
['IsDCinital', '$decomp =~ /^<medial>/', ''],
['IsDCfinal', '$decomp =~ /^<final>/', ''],
['IsDCisolated', '$decomp =~ /^<isolated>/', ''],
['IsDCcircle', '$decomp =~ /^<circle>/', ''],
['IsDCsuper', '$decomp =~ /^<super>/', ''],
['IsDCsub', '$decomp =~ /^<sub>/', ''],
['IsDCvertical', '$decomp =~ /^<vertical>/', ''],
['IsDCwide', '$decomp =~ /^<wide>/', ''],
['IsDCnarrow', '$decomp =~ /^<narrow>/', ''],
['IsDCsmall', '$decomp =~ /^<small>/', ''],
['IsDCsquare', '$decomp =~ /^<square>/', ''],
['IsDCcompat', '$decomp =~ /^<compat>/', ''],
# Number
['Number', '$num', '$num'],
# Mirrored
['IsMirrored', '$mir eq "Y"', ''],
# Arabic
['ArabLink', '1', '$link'],
['ArabLnkGrp', '1', '$linkgroup'],
# Jamo
['JamoShort', '1', '$short'],
# Syllables
['IsSylV', '$syl eq "V"', ''],
['IsSylU', '$syl eq "U"', ''],
['IsSylI', '$syl eq "I"', ''],
['IsSylA', '$syl eq "A"', ''],
['IsSylE', '$syl eq "E"', ''],
['IsSylC', '$syl eq "C"', ''],
['IsSylO', '$syl eq "O"', ''],
['IsSylWV', '$syl eq "V"', ''],
['IsSylWI', '$syl eq "I"', ''],
['IsSylWA', '$syl eq "A"', ''],
['IsSylWE', '$syl eq "E"', ''],
['IsSylWC', '$syl eq "C"', ''],
);
# This is not written for speed...
foreach $file (@todo) {
my ($table, $wanted, $val) = @$file;
next if @ARGV and not grep { $_ eq $table } @ARGV;
print $table,"\n";
if ($table =~ /^(Is|In|To)(.*)/) {
open(OUT, ">$1/$2.pl") or die "Can't create $1/$2.pl: $!\n";
}
else {
open(OUT, ">$table.pl") or die "Can't create $table.pl: $!\n";
}
print OUT <<EOH;
# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is built by $0 from e.g. $UnicodeData.
# Any changes made here will be lost!
EOH
print OUT <<"END";
return <<'END';
END
print OUT proplist($table, $wanted, $val);
print OUT "END\n";
close OUT;
}
# Must treat blocks specially.
exit if @ARGV and not grep { $_ eq Block } @ARGV;
print "Block\n";
open(UD, 'Blocks.txt') or die "Can't open blocks.txt: $!\n";
open(OUT, ">Block.pl") or die "Can't create $table.pl: $!\n";
print OUT <<EOH;
# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is built by $0 from e.g. $UnicodeData.
# Any changes made here will be lost!
EOH
print OUT <<"END";
return <<'END';
END
while (<UD>) {
next if /^#/;
next if /^$/;
chomp;
($code, $last, $name) = split(/; */);
if ($name) {
print OUT "$code $last $name\n";
$name =~ s/\s+//g;
open(BLOCK, ">In/$name.pl");
print BLOCK <<EOH;
# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
# This file is built by $0 from e.g. $UnicodeData.
# Any changes made here will be lost!
EOH
print BLOCK <<"END2";
return <<'END';
$code $last
END
END2
close BLOCK;
}
}
print OUT "END\n";
close OUT;
##################################################
sub proplist {
my ($table, $wanted, $val) = @_;
my @wanted;
my $out;
my $split;
if ($table =~ /^Arab/) {
open(UD, "ArabShap.txt") or warn "Can't open $table: $!";
$split = '($code, $name, $link, $linkgroup) = split(/; */);';
}
elsif ($table =~ /^Jamo/) {
open(UD, "Jamo.txt") or warn "Can't open $table: $!";
$split = '($code, $short, $name) = split(/; */); $code =~ s/^U\+//;';
}
elsif ($table =~ /^IsSyl/) {
open(UD, "syllables.txt") or warn "Can't open $table: $!";
$split = '($code, $short, $syl) = split(/; */); $code =~ s/^U\+//;';
}
else {
open(UD, $UnicodeData) or warn "Can't open $UnicodeData: $!";
$split = '($code, $name, $cat, $comb, $bid, $decomp, $dec, $dig, $num, $mir, $uni1,
$comment, $up, $down, $title) = split(/;/);';
}
if ($table =~ /^(?:To|Is)[A-Z]/) {
eval <<"END";
while (<UD>) {
next if /^#/;
next if /^\s/;
chop;
$split
if ($wanted) {
push(\@wanted, [hex \$code, hex $val, \$name =~ /, First>\$/]);
}
}
END
die $@ if $@;
while (@wanted) {
$beg = shift @wanted;
$last = $beg;
while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
(not $val or $wanted[0]->[1] == $last->[1] + 1)) {
$last = shift @wanted;
}
$out .= sprintf "%04x", $beg->[0];
if ($beg->[2]) {
$last = shift @wanted;
}
if ($beg == $last) {
$out .= "\t";
}
else {
$out .= sprintf "\t%04x", $last->[0];
}
$out .= sprintf "\t%04x", $beg->[1] if $val;
$out .= "\n";
}
}
else {
eval <<"END";
while (<UD>) {
next if /^#/;
next if /^\s*\$/;
chop;
$split
if ($wanted) {
push(\@wanted, [hex \$code, $val, \$name =~ /, First>\$/]);
}
}
END
die $@ if $@;
while (@wanted) {
$beg = shift @wanted;
$last = $beg;
while (@wanted and $wanted[0]->[0] == $last->[0] + 1 and
($wanted[0]->[1] eq $last->[1])) {
$last = shift @wanted;
}
$out .= sprintf "%04x", $beg->[0];
if ($beg->[2]) {
$last = shift @wanted;
}
if ($beg == $last) {
$out .= "\t";
}
else {
$out .= sprintf "\t%04x", $last->[0];
}
$out .= sprintf "\t%s\n", $beg->[1];
}
}
$out;
}
# eof