diff --git a/tools/tools/locale/Makefile b/tools/tools/locale/Makefile index 27ff255d7f9a..92f890b2f4d3 100644 --- a/tools/tools/locale/Makefile +++ b/tools/tools/locale/Makefile @@ -168,7 +168,8 @@ ENCODINGS= Big5 \ KOI8-U \ SJIS \ US-ASCII \ - UTF-8 + UTF-8 \ + UTF-32 # CLDR files CLDRFILES_CORE= https://unicode.org/Public/cldr/35/core.zip @@ -211,9 +212,10 @@ ${UNIDIR}/posix: ln -s -f ../posix ${.TARGET} clean-posix: rm -rf posix ${UNIDIR}/posix -post-posixcm: ${UNIDIR}/posix +${UNIDIR}/posix/xx_Comm_C.UTF-8.src: ${UNIDIR}/posix perl -I ${TOOLSDIR} ${TOOLSDIR}/utf8-rollup.pl \ --unidir=${UNIDIR} +post-posixcm: ${UNIDIR}/posix/xx_Comm_C.UTF-8.src .for enc in ${ENCODINGS} posixcm: build-tools posix/${enc}.cm .ORDER: build-tools posix/${enc}.cm diff --git a/tools/tools/locale/README b/tools/tools/locale/README index 0b5ce24b51cd..380786929b7c 100644 --- a/tools/tools/locale/README +++ b/tools/tools/locale/README @@ -19,7 +19,7 @@ More details are as follows: Variables: LOCALESRCDIR Destination path for the generated locale files. - Default: $DESTDIR/usr/src/share. + Default: ${SRCTOP}/share. TMPDIR Temporary directory. Default: /tmp @@ -29,7 +29,12 @@ Targets: Create a temporary directory for building. make clean - Clean up the obj directories. + Clean up the obj directories. Note that this does not + clean up tools or posix locale source files generated + from the CLDR files because it takes a long time to generate + them and they are not changed as long as using the same + CLDR files. "make clean && make build" will + regenerate the locale source files for src/share/*def. make cleandir Remove the obj directories completely. diff --git a/tools/tools/locale/etc/charmaps.xml b/tools/tools/locale/etc/charmaps.xml index 78a344d6929e..52e80f2dee05 100644 --- a/tools/tools/locale/etc/charmaps.xml +++ b/tools/tools/locale/etc/charmaps.xml @@ -195,395 +195,404 @@ + - - - - + + + + + cldr="MINUS_SIGN" unicode="HYPHEN-MINUS" /> + cldr="EN_DASH" unicode="HYPHEN-MINUS" /> + cldr="CYRILLIC_CAPITAL_LETTER_JE" + unicode="LATIN_CAPITAL_LETTER_J" /> + cldr="CYRILLIC_SMALL_LETTER_JE" unicode="LATIN_SMALL_LETTER_J" /> + cldr="CYRILLIC_CAPITAL_LETTER_LJE" string="lj" /> + cldr="CYRILLIC_SMALL_LETTER_LJE" string="lj" /> + cldr="CYRILLIC_CAPITAL_LETTER_A" unicode="LATIN_CAPITAL_LETTER_A" /> + cldr="CYRILLIC_SMALL_LETTER_A" unicode="LATIN_SMALL_LETTER_A" /> + cldr="CYRILLIC_CAPITAL_LETTER_BE" + unicode="LATIN_CAPITAL_LETTER_B" /> + cldr="CYRILLIC_SMALL_LETTER_BE" unicode="LATIN_SMALL_LETTER_B" /> + cldr="CYRILLIC_CAPITAL_LETTER_VE" + unicode="LATIN_CAPITAL_LETTER_B" /> + cldr="CYRILLIC_SMALL_LETTER_VE" unicode="LATIN_SMALL_LETTER_B" /> + cldr="CYRILLIC_CAPITAL_LETTER_GHE" + unicode="LATIN_CAPITAL_LETTER_G" /> + cldr="CYRILLIC_SMALL_LETTER_GHE" unicode="LATIN_SMALL_LETTER_G" /> + cldr="CYRILLIC_CAPITAL_LETTER_DE" string="D" /> + cldr="CYRILLIC_SMALL_LETTER_DE" string="d" /> + cldr="CYRILLIC_CAPITAL_LETTER_IE" + unicode="LATIN_CAPITAL_LETTER_E" /> + cldr="CYRILLIC_SMALL_LETTER_IE" unicode="LATIN_SMALL_LETTER_E" /> + cldr="CYRILLIC_CAPITAL_LETTER_ZHE" string="ZH" /> + cldr="CYRILLIC_SMALL_LETTER_ZHE" string="zh" /> + cldr="CYRILLIC_CAPITAL_LETTER_ZE" string="z" /> + cldr="CYRILLIC_SMALL_LETTER_ZE" string="z" /> + cldr="CYRILLIC_CAPITAL_LETTER_I" unicode="LATIN_CAPITAL_LETTER_J" /> + cldr="CYRILLIC_SMALL_LETTER_I" unicode="LATIN_CAPITAL_LETTER_J" /> + cldr="CYRILLIC_CAPITAL_LETTER_I" unicode="LATIN_SMALL_LETTER_J" /> + cldr="CYRILLIC_SMALL_LETTER_I" unicode="LATIN_SMALL_LETTER_J" /> + cldr="CYRILLIC_CAPITAL_LETTER_KA" + unicode="LATIN_CAPITAL_LETTER_K" /> + cldr="CYRILLIC_SMALL_LETTER_KA" unicode="LATIN_SMALL_LETTER_K" /> + cldr="CYRILLIC_CAPITAL_LETTER_EL" + unicode="LATIN_CAPITAL_LETTER_L" /> + cldr="CYRILLIC_SMALL_LETTER_EL" unicode="LATIN_SMALL_LETTER_L" /> + cldr="CYRILLIC_CAPITAL_LETTER_EM" + unicode="LATIN_CAPITAL_LETTER_M" /> + cldr="CYRILLIC_SMALL_LETTER_EM" unicode="LATIN_SMALL_LETTER_M" /> + cldr="CYRILLIC_CAPITAL_LETTER_EN" + unicode="LATIN_CAPITAL_LETTER_H" /> + cldr="CYRILLIC_SMALL_LETTER_EN" unicode="LATIN_SMALL_LETTER_H" /> + cldr="CYRILLIC_CAPITAL_LETTER_O" unicode="LATIN_CAPITAL_LETTER_O" /> + cldr="CYRILLIC_SMALL_LETTER_O" unicode="LATIN_SMALL_LETTER_O" /> + cldr="CYRILLIC_CAPITAL_LETTER_PE" + unicode="LATIN_CAPITAL_LETTER_P" /> + cldr="CYRILLIC_SMALL_LETTER_PE" unicode="LATIN_SMALL_LETTER_P" /> + cldr="CYRILLIC_CAPITAL_LETTER_ER" + unicode="LATIN_CAPITAL_LETTER_R" /> + cldr="CYRILLIC_SMALL_LETTER_ER" unicode="LATIN_SMALL_LETTER_R" /> + cldr="CYRILLIC_CAPITAL_LETTER_ES" + unicode="LATIN_CAPITAL_LETTER_C" /> + cldr="CYRILLIC_SMALL_LETTER_ES" unicode="LATIN_SMALL_LETTER_C" /> + cldr="CYRILLIC_CAPITAL_LETTER_TE" + unicode="LATIN_CAPITAL_LETTER_T" /> + cldr="CYRILLIC_SMALL_LETTER_TE" unicode="LATIN_SMALL_LETTER_T" /> + cldr="CYRILLIC_CAPITAL_LETTER_U" unicode="LATIN_CAPITAL_LETTER_U" /> + cldr="CYRILLIC_SMALL_LETTER_U" unicode="LATIN_SMALL_LETTER_U" /> + cldr="CYRILLIC_CAPITAL_LETTER_EF" + unicode="LATIN_CAPITAL_LETTER_F" /> + cldr="CYRILLIC_SMALL_LETTER_EF" unicode="LATIN_SMALL_LETTER_F" /> + cldr="CYRILLIC_CAPITAL_LETTER_HA" + unicode="LATIN_CAPITAL_LETTER_H" /> + cldr="CYRILLIC_SMALL_LETTER_HA" unicode="LATIN_SMALL_LETTER_H" /> + cldr="CYRILLIC_CAPITAL_LETTER_TSE" + unicode="LATIN_CAPITAL_LETTER_C" /> + cldr="CYRILLIC_SMALL_LETTER_TSE" unicode="LATIN_SMALL_LETTER_C" /> + cldr="CYRILLIC_CAPITAL_LETTER_CHE" + unicode="LATIN_CAPITAL_LETTER_C_WITH_CARON" /> + cldr="CYRILLIC_SMALL_LETTER_CHE" + unicode="LATIN_SMALL_LETTER_C_WITH_CARON" /> + cldr="CYRILLIC_CAPITAL_LETTER_SHA" + unicode="LATIN_CAPITAL_LETTER_S_WITH_CARON" /> + cldr="CYRILLIC_SMALL_LETTER_SHA" + unicode="LATIN_SMALL_LETTER_S_WITH_CARON" /> + cldr="CYRILLIC_CAPITAL_LETTER_SHCHA" + unicode="LATIN_CAPITAL_LETTER_S_WITH_CIRCUMFLEX" /> + cldr="CYRILLIC_SMALL_LETTER_SHCHA" + unicode="LATIN_SMALL_LETTER_S_WITH_CIRCUMFLEX" /> + cldr="?CYRILLIC_CAPITAL_LETTER_HARD_SIGN" unicode="?" /> + cldr="?CYRILLIC_SMALL_LETTER_HARD_SIGN" unicode="?" /> + cldr="?CYRILLIC_CAPITAL_LETTER_YERU" unicode="?" /> + cldr="?CYRILLIC_SMALL_LETTER_YERU" unicode="?" /> + cldr="?CYRILLIC_CAPITAL_LETTER_SOFT_SIGN" unicode="?" /> + cldr="?CYRILLIC_SMALL_LETTER_SOFT_SIGN" unicode="?" /> + cldr="CYRILLIC_CAPITAL_LETTER_E" + unicode="LATIN_CAPITAL_LETTER_E_WITH_GRAVE" /> + cldr="CYRILLIC_SMALL_LETTER_E" + unicode="LATIN_SMALL_LETTER_E_WITH_GRAVE" /> + cldr="?CYRILLIC_CAPITAL_LETTER_YU" unicode="?" /> + cldr="?CYRILLIC_SMALL_LETTER_YU" unicode="?" /> + cldr="CYRILLIC_CAPITAL_LETTER_YA" + unicode="LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX" /> + cldr="CYRILLIC_SMALL_LETTER_YA" + unicode="LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX" /> + cldr="LATIN_SMALL_LETTER_T_WITH_COMMA_BELOW" + unicode="LATIN_SMALL_LETTER_T" /> + cldr="MODIFIER_LETTER_APOSTROPHE" unicode="APOSTROPHE" /> + cldr="LATIN_SMALL_LETTER_C_WITH_CARON" + unicode="LATIN_SMALL_LETTER_C" /> + cldr="MODIFIER_LETTER_APOSTROPHE" unicode="APOSTROPHE" /> + cldr="MODIFIER_LETTER_APOSTROPHE" unicode="APOSTROPHE" /> - - - - - - - + + + + + + + - - + + + unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_C" /> + unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_D" /> + unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_N" /> + unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_T" /> + unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_W" /> + unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_Y" /> + unicode="FULLWIDTH_DIGIT_ONE" /> + unicode="FULLWIDTH_DIGIT_TWO" /> + unicode="FULLWIDTH_DIGIT_THREE" /> + unicode="FULLWIDTH_DIGIT_FOUR" /> + unicode="FULLWIDTH_DIGIT_FIVE" /> + unicode="FULLWIDTH_DIGIT_SIX" /> + unicode="FULLWIDTH_DIGIT_SEVEN" /> + unicode="FULLWIDTH_DIGIT_EIGHT" /> + unicode="FULLWIDTH_DIGIT_NINE" /> + unicode="FULLWIDTH_DIGIT_ZERO" /> - + unicode="IDEOGRAPHIC_SPACE" /> + + unicode="FULLWIDTH_SOLIDUS" /> + unicode="FULLWIDTH_COMMA" /> - + unicode="FULLWIDTH_HYPHEN-MINUS" /> + + cldr="CJK_UNIFIED_IDEOGRAPH-4E00" ucc="4E00" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E03" ucc="4E03" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E09" ucc="4E09" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E0A" ucc="4E0A" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E0B" ucc="4E0B" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E0D" ucc="4E0D" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E5D" ucc="4E5D" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E8C" ucc="4E8C" /> + cldr="CJK_UNIFIED_IDEOGRAPH-4E94" ucc="4E94" /> + cldr="CJK_UNIFIED_IDEOGRAPH-516B" ucc="516B" /> + cldr="CJK_UNIFIED_IDEOGRAPH-516D" ucc="516D" /> + cldr="CJK_UNIFIED_IDEOGRAPH-5206" ucc="5206" /> + cldr="CJK_UNIFIED_IDEOGRAPH-524D" ucc="524D" /> + cldr="CJK_UNIFIED_IDEOGRAPH-5341" ucc="5341" /> + cldr="CJK_UNIFIED_IDEOGRAPH-5348" ucc="5348" /> + cldr="CJK_UNIFIED_IDEOGRAPH-5426" ucc="5426" /> + cldr="CJK_UNIFIED_IDEOGRAPH-5468" ucc="5468" /> + cldr="CJK_UNIFIED_IDEOGRAPH-56DB" ucc="56DB" /> + cldr="CJK_UNIFIED_IDEOGRAPH-571F" ucc="571F" /> + cldr="CJK_UNIFIED_IDEOGRAPH-5B9A" ucc="5B9A" /> + cldr="CJK_UNIFIED_IDEOGRAPH-5E74" ucc="5E74" /> + cldr="CJK_UNIFIED_IDEOGRAPH-5F8C" ucc="5F8C" /> + cldr="CJK_UNIFIED_IDEOGRAPH-65E5" ucc="65E5" /> + cldr="CJK_UNIFIED_IDEOGRAPH-65F6" ucc="65F6" /> + cldr="CJK_UNIFIED_IDEOGRAPH-661F" ucc="661F" /> + cldr="CJK_UNIFIED_IDEOGRAPH-662F" ucc="662F" /> + cldr="CJK_UNIFIED_IDEOGRAPH-6642" ucc="6642" /> + cldr="CJK_UNIFIED_IDEOGRAPH-66DC" ucc="66DC" /> + cldr="CJK_UNIFIED_IDEOGRAPH-6708" ucc="6708" /> + cldr="CJK_UNIFIED_IDEOGRAPH-671F" ucc="671F" /> + cldr="CJK_UNIFIED_IDEOGRAPH-6728" ucc="6728" /> + cldr="CJK_UNIFIED_IDEOGRAPH-6C34" ucc="6C34" /> + cldr="CJK_UNIFIED_IDEOGRAPH-706B" ucc="706B" /> + cldr="CJK_UNIFIED_IDEOGRAPH-786E" ucc="786E" /> + cldr="CJK_UNIFIED_IDEOGRAPH-78BA" ucc="78BA" /> + cldr="CJK_UNIFIED_IDEOGRAPH-79D2" ucc="79D2" /> + cldr="CJK_UNIFIED_IDEOGRAPH-9031" ucc="9031" /> + cldr="CJK_UNIFIED_IDEOGRAPH-91D1" ucc="91D1" /> + cldr="HANGUL_SYLLABLE_GEUM" ucc="AE08" /> + cldr="HANGUL_SYLLABLE_NYEON" ucc="B144" /> + cldr="HANGUL_SYLLABLE_NI" ucc="B2C8" /> + cldr="HANGUL_SYLLABLE_MOG" ucc="BAA9" /> + cldr="HANGUL_SYLLABLE_BUN" ucc="BD84" /> + cldr="HANGUL_SYLLABLE_SU" ucc="C218" /> + cldr="HANGUL_SYLLABLE_SI" ucc="C2DC" /> + cldr="HANGUL_SYLLABLE_A" ucc="C544" /> + cldr="HANGUL_SYLLABLE_YE" ucc="C608" /> + cldr="HANGUL_SYLLABLE_O" ucc="C624" /> + cldr="HANGUL_SYLLABLE_YO" ucc="C694" /> + cldr="HANGUL_SYLLABLE_WEOL" ucc="C6D4" /> + cldr="HANGUL_SYLLABLE_IL" ucc="C77C" /> + cldr="HANGUL_SYLLABLE_JEON" ucc="C804" /> + cldr="HANGUL_SYLLABLE_CO" ucc="CD08" /> + cldr="HANGUL_SYLLABLE_TO" ucc="D1A0" /> + cldr="HANGUL_SYLLABLE_HWA" ucc="D654" /> + cldr="HANGUL_SYLLABLE_HU" ucc="D6C4" /> + cldr="ONE_DOT_LEADER" unicode="FULL_STOP" /> - + + cldr="NO-BREAK_SPACE" unicode="SPACE" /> + cldr="NARROW_NO-BREAK_SPACE" unicode="NO-BREAK_SPACE" /> + cldr="RIGHT_SINGLE_QUOTATION_MARK" unicode="APOSTROPHE" /> - - - - + + + - diff --git a/tools/tools/locale/tools/cldr2def.pl b/tools/tools/locale/tools/cldr2def.pl index 8617ca81ca40..fd475db714a0 100755 --- a/tools/tools/locale/tools/cldr2def.pl +++ b/tools/tools/locale/tools/cldr2def.pl @@ -4,6 +4,7 @@ # # Copyright 2009 Edwin Groothuis # Copyright 2015 John Marino +# Copyright 2020 Hiroki Sato # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -38,7 +39,6 @@ use Getopt::Long; use Digest::SHA qw(sha1_hex); require "charmaps.pm"; - if ($#ARGV < 2) { print "Usage: $0 --unidir= --etc= --type=\n"; exit(1); @@ -69,10 +69,11 @@ my %encodings = (); my %alternativemonths = (); get_languages(); -my %utf8map = (); -my %utf8aliases = (); -get_unidata($UNIDIR); -get_utf8map("$UNIDIR/posix/$DEFENCODING.cm"); +my %utfmap = (); +$utfmap{'UTF-8'} = {}; +$utfmap{'UTF-32'} = {}; +get_utfmap("$UNIDIR/posix/$DEFENCODING.cm", $utfmap{'UTF-8'}); +get_utfmap("$UNIDIR/posix/UTF-32.cm", $utfmap{'UTF-32'}); get_encodings("$ETCDIR/charmaps"); my %keys = (); @@ -334,25 +335,8 @@ sub callback_abmon { ############################ -sub get_unidata { - my $directory = shift; - - open(FIN, "$directory/UnicodeData.txt") - or die("Cannot open $directory/UnicodeData.txt");; - my @lines = ; - chomp(@lines); - close(FIN); - - foreach my $l (@lines) { - my @a = split(/;/, $l); - - $ucd{code2name}{"$a[0]"} = $a[1]; # Unicode name - $ucd{name2code}{"$a[1]"} = $a[0]; # Unicode code - } -} - -sub get_utf8map { - my $file = shift; +sub get_utfmap { + my ($file, $db) = @_; open(FIN, $file); my @lines = ; @@ -363,7 +347,7 @@ sub get_utf8map { my $prev_v = ""; my $incharmap = 0; foreach my $l (@lines) { - $l =~ s/\r//; + chomp($l); next if ($l =~ /^\#/); next if ($l eq ""); @@ -378,17 +362,28 @@ sub get_utf8map { $l =~ /^<([^\s]+)>\s+(.*)/; my $k = $1; my $v = $2; - $k =~ s/_/ /g; # unicode char string $v =~ s/\\x//g; # UTF-8 char code - $utf8map{$k} = $v; + $db->{$k} = $v; +# print STDERR "UTF $k = $v\n"; - $utf8aliases{$k} = $prev_k if ($prev_v eq $v); + # XXX: no longer needed + # $db_alias->{$k} = $prev_k if ($prev_v eq $v); $prev_v = $v; $prev_k = $k; } } +sub resolve_enc_addition { + my $ret = ''; + + foreach my $t (split(/\+/, $_[0])) { + $t =~ s/^0[xX]//; + $ret .= $t; + } + return $ret; +} + sub get_encodings { my $dir = shift; foreach my $e (sort(keys(%encodings))) { @@ -403,14 +398,20 @@ sub get_encodings { chomp(@lines); foreach my $l (@lines) { $l =~ s/\r//; - next if ($l =~ /^\#/); next if ($l eq ""); my @a = split(" ", $l); next if ($#a < 1); - $a[0] =~ s/^0[xX]//; # local char code - $a[1] =~ s/^0[xX]//; # unicode char code - $convertors{$e}{uc($a[1])} = uc($a[0]); + next if ($a[0] =~ /^\#/ or $a[1] =~ /^\#/); + next if ($a[0] eq '' or $a[1] eq ''); + + $a[0] = resolve_enc_addition($a[0]); # local + $a[1] = resolve_enc_addition($a[1]); # UTF-32 + my $u32 = sprintf("%08X", hex($a[1])); +# print STDERR "$a[1] => $u32\n"; + + # Use UTF-32 as the indices. + $convertors{$e}{$u32} = uc($a[0]); } } } @@ -565,8 +566,75 @@ EOF foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) { next if ($enc eq $DEFENCODING); - copy ("$TYPE.draft/$actfile.$DEFENCODING.src", - "$TYPE.draft/$actfile.$enc.src"); + + open FIN, "<$TYPE.draft/$actfile.$DEFENCODING.src"; + open FOUT, ">$TYPE.draft/$actfile.$enc.src"; + my $order_start = 0; + my $print_p = 0; + # + # %c_elem: collation elements + # + # undef: not defined + # 1: defined + # 2: invalid in this encoding + # + my %c_elem = (); + while () { # XXX: this loop should be refactored. + chomp; + $print_p = 1; + if ($order_start) { + $order_start = 0 if (m/^order_end/); + if (m/^<([^>]+)>/) { + if (not defined $c_elem{$1}) { +# print STDERR "$1:\n"; + + my $u32 = $utfmap{'UTF-32'}->{$1}; + die "order, $1\n" if (not defined $u32); +# print STDERR "u32 for $1 = $u32\n"; + if (not defined $convertors{$enc}{$u32}) { +# print STDERR "$1 - $u32 not defined in $enc\n"; + $print_p = 0; + } + } elsif ($c_elem{$1} == 2) { +# print STDERR "$1 is marked as invalid in $enc\n"; + $print_p = 0; + } + } + } elsif (m/^collating-element/) { + my ($elem, $l); + if (m/<([^>]+)> from (.+)/) { + ($elem, $l) = ($1, $2); + } +# print STDERR "$elem: enter ($print_p, $l,)\n"; + while ($print_p and + defined $l and + $l =~ m/<([^>]+)>/g) { +# print STDERR "$elem: $1\n"; + my $u32 = $utfmap{'UTF-32'}->{$1}; + die "collating-element, $1\n" if (not defined $u32); +# print STDERR "u32 for $1 = $u32\n"; + if (not $convertors{$enc}{$u32}) { +# print STDERR "$1 - $u32 not defined in $enc\n"; + $print_p = 0; +# print STDERR "Mark $elem as invalid\n"; + $c_elem{$elem} = 2; + } + } + if ($print_p) { +# print STDERR "Add $elem\n"; + $c_elem{$elem} = 1; + } + } elsif (m/^collating-symbol <([^>]+)>/) { +# print STDERR "Add $1\n"; + $c_elem{$1} = 1; + } elsif (m/^order_start/) { + $order_start = 1; + # do nothing + } + print FOUT $_, "\n" if ($print_p); + } + close FOUT; + close FIN; $languages{$l}{$f}{data}{$c}{$enc} = $shex; $hashtable{$shex}{"${l}_${f}_${c}.$enc"} = 1; } @@ -626,11 +694,11 @@ sub get_fields { $continue = ($line =~ /\/$/); $line =~ s/\/$// if ($continue); - while ($line =~ /_/) { - $line =~ - s/\<([^>_]+)_([^>]+)\>/<$1 $2>/; - } - die "_ in data - $line" if ($line =~ /_/); +# while ($line =~ /_/) { +# $line =~ +# s/\<([^>_]+)_([^>]+)\>/<$1 $2>/; +# } +# die "_ in data - $line" if ($line =~ /_/); $values{$l}{$f}{$c}{$k} .= $line; last if (!$continue); @@ -652,56 +720,52 @@ sub decodecldr { # Conversion to UTF-8 can be done from the Unicode name to # the UTF-8 character code. # - $v = $utf8map{$s}; + $v = $utfmap{'UTF-8'}->{$s}; die "Cannot convert $s in $e (charmap)" if (!defined $v); } else { # # Conversion to these encodings can be done from the Unicode # name to Unicode code to the encodings code. # - my $ucc = undef; - $ucc = $ucd{name2code}{$s} if (defined $ucd{name2code}{$s}); - $ucc = $ucd{name2code}{$utf8aliases{$s}} - if (!defined $ucc - && $utf8aliases{$s} - && defined $ucd{name2code}{$utf8aliases{$s}}); + # hex - hex or string attr + # unicode - unicode attr + # ucc - ucc attr + my $hex = $translations{$e}{$s}{hex}; + my $ucc = $utfmap{'UTF-32'}->{$s}; + my $ucc_attr = $translations{$e}{$s}{ucc}; + my $unicode = $translations{$e}{$s}{unicode}; - if (!defined $ucc) { - if (defined $translations{$e}{$s}{hex}) { - $v = $translations{$e}{$s}{hex}; - $ucc = 0; - } elsif (defined $translations{$e}{$s}{ucc}) { - $ucc = $translations{$e}{$s}{ucc}; + if (defined $hex) { # hex is in local encoding + $v = $hex; + } elsif (defined $unicode) { # unicode is in name + $v = $convertors{$e}{$utfmap{'UTF-32'}->{$unicode}}; + } elsif (defined $ucc_attr) { # ucc is in code point + if (defined $ucc) { +# print STDERR "INFO: ucc=$ucc_attr ", +# "overrides $ucc in UTF-32\n"; } - } - - die "Cannot convert $s in $e (ucd string)" if (!defined $ucc); - $v = $convertors{$e}{$ucc} if (!defined $v); - - $v = $translations{$e}{$s}{hex} - if (!defined $v && defined $translations{$e}{$s}{hex}); - - if (!defined $v && defined $translations{$e}{$s}{unicode}) { - my $ucn = $translations{$e}{$s}{unicode}; - $ucc = $ucd{name2code}{$ucn} - if (defined $ucd{name2code}{$ucn}); - $ucc = $ucd{name2code}{$utf8aliases{$ucn}} - if (!defined $ucc - && defined $ucd{name2code}{$utf8aliases{$ucn}}); + # normalize + $ucc_attr = sprintf("%08X", hex($ucc_attr)); +# print STDERR "convert $ucc_attr into $e\n"; + $v = $convertors{$e}{$ucc_attr}; + } elsif (defined $ucc) { + # normalize + $ucc = sprintf("%08X", hex($ucc)); +# print STDERR "convert $ucc into $e\n"; $v = $convertors{$e}{$ucc}; } - - die "Cannot convert $s in $e (charmap)" if (!defined $v); + die "Cannot convert $s in $e" if (!defined $v); } + # XXX: length = 8 is not supported yet. + $v =~ s/^[0]+//g; + $v = "0" . $v if (length($v) % 2); return pack("C", hex($v)) if (length($v) == 2); return pack("CC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2))) if (length($v) == 4); return pack("CCC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)), hex(substr($v, 4, 2))) if (length($v) == 6); - print STDERR "Cannot convert $e $s\n"; - return "length = " . length($v); - + die "Cannot convert $s in $e (length = " . length($v) . "\n"; } sub translate {