Fix generation of colldef source files for non-UTF-8 locales

- Files for colldef were generated by duplicating UTF-8 collation files
  for each language and included invalid characters in the non-UTF-8
  encodings.  localedef(1) does not allow those characters.
  cldr2def.pl now checks if the characters are valid based on charmap files.

  TODO: ja_JP.UTF-8 locale should not be generated solely from CLDR because
  it was standardized in a document "UI-OSF Application Platform Profile for
  Japanese Environment" which was incompatible with information in CLDR.
  Most commercial Unix vendors adopt this pre-Unicode-era document
  as the reference even for UTF-8 locale.  Newer versions of Solaris have
  added a CLDR version as ja_JP.UTF-8@cldr, and IBM AIX has used
  JA_JP.UTF-8 for the UI-OSF specification and ja_JP.UTF-8 for CLDR.

  Note that this commit does not change generation of ja_JP.UTF-8.
  Changes related to this issue will be committed separately later.

- Generate POSIX charmap UTF-32 as a reference.  It was confusing that
  charmap.xml used Unicode names defined in UnicodeData.txt though POSIX
  charmap used slightly different names for the same code points.
  cldr2def.pl now uses UTF-32.cm as the single source of information for
  Unicode symbol names and code points.  Charset.xml is also updated to
  use them.

- Fix a bug in get_encodings() in cldr2def.pl which did not understand
  0x00+0x00 notation correctly in charmaps/ISCII-DEV.TXT.

- Do not regenerate posix/xx_Comm_C.UTF-8.src every time
  "make build" is run.

Reviewed by:	bapt
Differential Revision:	https://reviews.freebsd.org/D27809
This commit is contained in:
Hiroki Sato 2020-12-30 04:21:19 +09:00
parent f3f16c31fe
commit 916806472a
4 changed files with 362 additions and 282 deletions

View File

@ -168,7 +168,8 @@ ENCODINGS= Big5 \
KOI8-U \
SJIS \
US-ASCII \
UTF-8
UTF-8 \
UTF-32
# CLDR files
CLDRFILES_CORE= https://unicode.org/Public/cldr/35/core.zip
@ -211,9 +212,10 @@ ${UNIDIR}/posix:
ln -s -f ../posix ${.TARGET}
clean-posix:
rm -rf posix ${UNIDIR}/posix
post-posixcm: ${UNIDIR}/posix
${UNIDIR}/posix/xx_Comm_C.UTF-8.src: ${UNIDIR}/posix
perl -I ${TOOLSDIR} ${TOOLSDIR}/utf8-rollup.pl \
--unidir=${UNIDIR}
post-posixcm: ${UNIDIR}/posix/xx_Comm_C.UTF-8.src
.for enc in ${ENCODINGS}
posixcm: build-tools posix/${enc}.cm
.ORDER: build-tools posix/${enc}.cm

View File

@ -19,7 +19,7 @@ More details are as follows:
Variables:
LOCALESRCDIR
Destination path for the generated locale files.
Default: $DESTDIR/usr/src/share.
Default: ${SRCTOP}/share.
TMPDIR
Temporary directory.
Default: /tmp
@ -29,7 +29,12 @@ Targets:
Create a temporary directory for building.
make clean
Clean up the obj directories.
Clean up the obj directories. Note that this does not
clean up tools or posix locale source files generated
from the CLDR files because they take a long time to
generate and do not change as long as the same CLDR files
are used. "make clean && make build" will regenerate the
locale source files for src/share/*def.
make cleandir
Remove the obj directories completely.

View File

@ -195,395 +195,404 @@
</languages>
<translations>
<!--
encoding: Space-separated list of encodings
cldr: Symbol to be replaced with hex, string, unicode, or ucc.
The symbol name should be defined in posix/*.cm files.
string: raw code in string.
hex: raw code in hex.
unicode: Symbol name in Unicode.
ucc: Unicode code point in hex.
-->
<!-- These don't have a special Euro sign so just use Eu for it -->
<translation encoding="ISO8859-1" cldr="EURO SIGN" string="Eu" />
<translation encoding="ISO8859-2" cldr="EURO SIGN" string="Eu" />
<translation encoding="ISO8859-4" cldr="EURO SIGN" string="Eu" />
<translation encoding="ISO8859-13" cldr="EURO SIGN" string="Eu" />
<translation encoding="ISO8859-1" cldr="EURO_SIGN" string="Eu" />
<translation encoding="ISO8859-2" cldr="EURO_SIGN" string="Eu" />
<translation encoding="ISO8859-4" cldr="EURO_SIGN" string="Eu" />
<translation encoding="ISO8859-13" cldr="EURO_SIGN" string="Eu" />
<!-- Minus and dashes -->
<translation encoding="ISO8859-1 ISO8859-2 ISO8859-4 ISO8859-13 ISO8859-15"
cldr="MINUS SIGN" unicode="HYPHEN-MINUS" />
cldr="MINUS_SIGN" unicode="HYPHEN-MINUS" />
<translation encoding="ISO8859-2"
cldr="EN DASH" unicode="HYPHEN-MINUS" />
cldr="EN_DASH" unicode="HYPHEN-MINUS" />
<!-- Got these from http://www.decodeunicode.org/en/u+0400.
Where possible use the international or ISO translation!
-->
<translation encoding="ISO8859-2" ucc="0408"
cldr="CYRILLIC CAPITAL LETTER JE"
unicode="LATIN CAPITAL LETTER J" />
cldr="CYRILLIC_CAPITAL_LETTER_JE"
unicode="LATIN_CAPITAL_LETTER_J" />
<translation encoding="ISO8859-2" ucc="0458"
cldr="CYRILLIC SMALL LETTER JE" unicode="LATIN SMALL LETTER J" />
cldr="CYRILLIC_SMALL_LETTER_JE" unicode="LATIN_SMALL_LETTER_J" />
<translation encoding="ISO8859-2" ucc="0409"
cldr="CYRILLIC CAPITAL LETTER LJE" string="lj" />
cldr="CYRILLIC_CAPITAL_LETTER_LJE" string="lj" />
<translation encoding="ISO8859-2" ucc="0459"
cldr="CYRILLIC SMALL LETTER LJE" string="lj" />
cldr="CYRILLIC_SMALL_LETTER_LJE" string="lj" />
<translation encoding="ISO8859-2" ucc="0410"
cldr="CYRILLIC CAPITAL LETTER A" unicode="LATIN CAPITAL LETTER A" />
cldr="CYRILLIC_CAPITAL_LETTER_A" unicode="LATIN_CAPITAL_LETTER_A" />
<translation encoding="ISO8859-2" ucc="0430"
cldr="CYRILLIC SMALL LETTER A" unicode="LATIN SMALL LETTER A" />
cldr="CYRILLIC_SMALL_LETTER_A" unicode="LATIN_SMALL_LETTER_A" />
<translation encoding="ISO8859-2" ucc="0411"
cldr="CYRILLIC CAPITAL LETTER BE"
unicode="LATIN CAPITAL LETTER B" />
cldr="CYRILLIC_CAPITAL_LETTER_BE"
unicode="LATIN_CAPITAL_LETTER_B" />
<translation encoding="ISO8859-2" ucc="0431"
cldr="CYRILLIC SMALL LETTER BE" unicode="LATIN SMALL LETTER B" />
cldr="CYRILLIC_SMALL_LETTER_BE" unicode="LATIN_SMALL_LETTER_B" />
<translation encoding="ISO8859-2" ucc="0412"
cldr="CYRILLIC CAPITAL LETTER VE"
unicode="LATIN CAPITAL LETTER B" />
cldr="CYRILLIC_CAPITAL_LETTER_VE"
unicode="LATIN_CAPITAL_LETTER_B" />
<translation encoding="ISO8859-2" ucc="0432"
cldr="CYRILLIC SMALL LETTER VE" unicode="LATIN SMALL LETTER B" />
cldr="CYRILLIC_SMALL_LETTER_VE" unicode="LATIN_SMALL_LETTER_B" />
<translation encoding="ISO8859-2" ucc="0413"
cldr="CYRILLIC CAPITAL LETTER GHE"
unicode="LATIN CAPITAL LETTER G" />
cldr="CYRILLIC_CAPITAL_LETTER_GHE"
unicode="LATIN_CAPITAL_LETTER_G" />
<translation encoding="ISO8859-2" ucc="0433"
cldr="CYRILLIC SMALL LETTER GHE" unicode="LATIN SMALL LETTER G" />
cldr="CYRILLIC_SMALL_LETTER_GHE" unicode="LATIN_SMALL_LETTER_G" />
<translation encoding="ISO8859-2" ucc="0414"
cldr="CYRILLIC CAPITAL LETTER DE" string="D" />
cldr="CYRILLIC_CAPITAL_LETTER_DE" string="D" />
<translation encoding="ISO8859-2" ucc="0434"
cldr="CYRILLIC SMALL LETTER DE" string="d" />
cldr="CYRILLIC_SMALL_LETTER_DE" string="d" />
<translation encoding="ISO8859-2" ucc="0415"
cldr="CYRILLIC CAPITAL LETTER IE"
unicode="LATIN CAPITAL LETTER E" />
cldr="CYRILLIC_CAPITAL_LETTER_IE"
unicode="LATIN_CAPITAL_LETTER_E" />
<translation encoding="ISO8859-2" ucc="0435"
cldr="CYRILLIC SMALL LETTER IE" unicode="LATIN SMALL LETTER E" />
cldr="CYRILLIC_SMALL_LETTER_IE" unicode="LATIN_SMALL_LETTER_E" />
<translation encoding="ISO8859-2" ucc="0416"
cldr="CYRILLIC CAPITAL LETTER ZHE" string="ZH" />
cldr="CYRILLIC_CAPITAL_LETTER_ZHE" string="ZH" />
<translation encoding="ISO8859-2" ucc="0436"
cldr="CYRILLIC SMALL LETTER ZHE" string="zh" />
cldr="CYRILLIC_SMALL_LETTER_ZHE" string="zh" />
<translation encoding="ISO8859-2" ucc="0417"
cldr="CYRILLIC CAPITAL LETTER ZE" string="z" />
cldr="CYRILLIC_CAPITAL_LETTER_ZE" string="z" />
<translation encoding="ISO8859-2" ucc="0437"
cldr="CYRILLIC SMALL LETTER ZE" string="z" />
cldr="CYRILLIC_SMALL_LETTER_ZE" string="z" />
<translation encoding="ISO8859-2" ucc="0418"
cldr="CYRILLIC CAPITAL LETTER I" unicode="LATIN CAPITAL LETTER J" />
cldr="CYRILLIC_CAPITAL_LETTER_I" unicode="LATIN_CAPITAL_LETTER_J" />
<translation encoding="ISO8859-2" ucc="0438"
cldr="CYRILLIC SMALL LETTER I" unicode="LATIN CAPITAL LETTER J" />
cldr="CYRILLIC_SMALL_LETTER_I" unicode="LATIN_CAPITAL_LETTER_J" />
<translation encoding="ISO8859-2" ucc="0419"
cldr="CYRILLIC CAPITAL LETTER I" unicode="LATIN SMALL LETTER J" />
cldr="CYRILLIC_CAPITAL_LETTER_I" unicode="LATIN_SMALL_LETTER_J" />
<translation encoding="ISO8859-2" ucc="0439"
cldr="CYRILLIC SMALL LETTER I" unicode="LATIN SMALL LETTER J" />
cldr="CYRILLIC_SMALL_LETTER_I" unicode="LATIN_SMALL_LETTER_J" />
<translation encoding="ISO8859-2" ucc="041A"
cldr="CYRILLIC CAPITAL LETTER KA"
unicode="LATIN CAPITAL LETTER K" />
cldr="CYRILLIC_CAPITAL_LETTER_KA"
unicode="LATIN_CAPITAL_LETTER_K" />
<translation encoding="ISO8859-2" ucc="043A"
cldr="CYRILLIC SMALL LETTER KA" unicode="LATIN SMALL LETTER K" />
cldr="CYRILLIC_SMALL_LETTER_KA" unicode="LATIN_SMALL_LETTER_K" />
<translation encoding="ISO8859-2" ucc="041B"
cldr="CYRILLIC CAPITAL LETTER EL"
unicode="LATIN CAPITAL LETTER L" />
cldr="CYRILLIC_CAPITAL_LETTER_EL"
unicode="LATIN_CAPITAL_LETTER_L" />
<translation encoding="ISO8859-2" ucc="043B"
cldr="CYRILLIC SMALL LETTER EL" unicode="LATIN SMALL LETTER L" />
cldr="CYRILLIC_SMALL_LETTER_EL" unicode="LATIN_SMALL_LETTER_L" />
<translation encoding="ISO8859-2" ucc="041C"
cldr="CYRILLIC CAPITAL LETTER EM"
unicode="LATIN CAPITAL LETTER M" />
cldr="CYRILLIC_CAPITAL_LETTER_EM"
unicode="LATIN_CAPITAL_LETTER_M" />
<translation encoding="ISO8859-2" ucc="043C"
cldr="CYRILLIC SMALL LETTER EM" unicode="LATIN SMALL LETTER M" />
cldr="CYRILLIC_SMALL_LETTER_EM" unicode="LATIN_SMALL_LETTER_M" />
<translation encoding="ISO8859-2" ucc="041D"
cldr="CYRILLIC CAPITAL LETTER EN"
unicode="LATIN CAPITAL LETTER H" />
cldr="CYRILLIC_CAPITAL_LETTER_EN"
unicode="LATIN_CAPITAL_LETTER_H" />
<translation encoding="ISO8859-2" ucc="043D"
cldr="CYRILLIC SMALL LETTER EN" unicode="LATIN SMALL LETTER H" />
cldr="CYRILLIC_SMALL_LETTER_EN" unicode="LATIN_SMALL_LETTER_H" />
<translation encoding="ISO8859-2" ucc="041E"
cldr="CYRILLIC CAPITAL LETTER O" unicode="LATIN CAPITAL LETTER O" />
cldr="CYRILLIC_CAPITAL_LETTER_O" unicode="LATIN_CAPITAL_LETTER_O" />
<translation encoding="ISO8859-2" ucc="043E"
cldr="CYRILLIC SMALL LETTER O" unicode="LATIN SMALL LETTER O" />
cldr="CYRILLIC_SMALL_LETTER_O" unicode="LATIN_SMALL_LETTER_O" />
<translation encoding="ISO8859-2" ucc="041F"
cldr="CYRILLIC CAPITAL LETTER PE"
unicode="LATIN CAPITAL LETTER P" />
cldr="CYRILLIC_CAPITAL_LETTER_PE"
unicode="LATIN_CAPITAL_LETTER_P" />
<translation encoding="ISO8859-2" ucc="043F"
cldr="CYRILLIC SMALL LETTER PE" unicode="LATIN SMALL LETTER P" />
cldr="CYRILLIC_SMALL_LETTER_PE" unicode="LATIN_SMALL_LETTER_P" />
<translation encoding="ISO8859-2" ucc="0420"
cldr="CYRILLIC CAPITAL LETTER ER"
unicode="LATIN CAPITAL LETTER R" />
cldr="CYRILLIC_CAPITAL_LETTER_ER"
unicode="LATIN_CAPITAL_LETTER_R" />
<translation encoding="ISO8859-2" ucc="0440"
cldr="CYRILLIC SMALL LETTER ER" unicode="LATIN SMALL LETTER R" />
cldr="CYRILLIC_SMALL_LETTER_ER" unicode="LATIN_SMALL_LETTER_R" />
<translation encoding="ISO8859-2" ucc="0421"
cldr="CYRILLIC CAPITAL LETTER ES"
unicode="LATIN CAPITAL LETTER C" />
cldr="CYRILLIC_CAPITAL_LETTER_ES"
unicode="LATIN_CAPITAL_LETTER_C" />
<translation encoding="ISO8859-2" ucc="0441"
cldr="CYRILLIC SMALL LETTER ES" unicode="LATIN SMALL LETTER C" />
cldr="CYRILLIC_SMALL_LETTER_ES" unicode="LATIN_SMALL_LETTER_C" />
<translation encoding="ISO8859-2" ucc="0422"
cldr="CYRILLIC CAPITAL LETTER TE"
unicode="LATIN CAPITAL LETTER T" />
cldr="CYRILLIC_CAPITAL_LETTER_TE"
unicode="LATIN_CAPITAL_LETTER_T" />
<translation encoding="ISO8859-2" ucc="0442"
cldr="CYRILLIC SMALL LETTER TE" unicode="LATIN SMALL LETTER T" />
cldr="CYRILLIC_SMALL_LETTER_TE" unicode="LATIN_SMALL_LETTER_T" />
<translation encoding="ISO8859-2" ucc="0423"
cldr="CYRILLIC CAPITAL LETTER U" unicode="LATIN CAPITAL LETTER U" />
cldr="CYRILLIC_CAPITAL_LETTER_U" unicode="LATIN_CAPITAL_LETTER_U" />
<translation encoding="ISO8859-2" ucc="0443"
cldr="CYRILLIC SMALL LETTER U" unicode="LATIN SMALL LETTER U" />
cldr="CYRILLIC_SMALL_LETTER_U" unicode="LATIN_SMALL_LETTER_U" />
<translation encoding="ISO8859-2" ucc="0424"
cldr="CYRILLIC CAPITAL LETTER EF"
unicode="LATIN CAPITAL LETTER F" />
cldr="CYRILLIC_CAPITAL_LETTER_EF"
unicode="LATIN_CAPITAL_LETTER_F" />
<translation encoding="ISO8859-2" ucc="0444"
cldr="CYRILLIC SMALL LETTER EF" unicode="LATIN SMALL LETTER F" />
cldr="CYRILLIC_SMALL_LETTER_EF" unicode="LATIN_SMALL_LETTER_F" />
<translation encoding="ISO8859-2" ucc="0425"
cldr="CYRILLIC CAPITAL LETTER HA"
unicode="LATIN CAPITAL LETTER H" />
cldr="CYRILLIC_CAPITAL_LETTER_HA"
unicode="LATIN_CAPITAL_LETTER_H" />
<translation encoding="ISO8859-2" ucc="0445"
cldr="CYRILLIC SMALL LETTER HA" unicode="LATIN SMALL LETTER H" />
cldr="CYRILLIC_SMALL_LETTER_HA" unicode="LATIN_SMALL_LETTER_H" />
<translation encoding="ISO8859-2" ucc="0426"
cldr="CYRILLIC CAPITAL LETTER TSE"
unicode="LATIN CAPITAL LETTER C" />
cldr="CYRILLIC_CAPITAL_LETTER_TSE"
unicode="LATIN_CAPITAL_LETTER_C" />
<translation encoding="ISO8859-2" ucc="0446"
cldr="CYRILLIC SMALL LETTER TSE" unicode="LATIN SMALL LETTER C" />
cldr="CYRILLIC_SMALL_LETTER_TSE" unicode="LATIN_SMALL_LETTER_C" />
<translation encoding="ISO8859-2" ucc="0427"
cldr="CYRILLIC CAPITAL LETTER CHE"
unicode="LATIN CAPITAL LETTER C WITH CARON" />
cldr="CYRILLIC_CAPITAL_LETTER_CHE"
unicode="LATIN_CAPITAL_LETTER_C_WITH_CARON" />
<translation encoding="ISO8859-2" ucc="0447"
cldr="CYRILLIC SMALL LETTER CHE"
unicode="LATIN SMALL LETTER C WITH CARON" />
cldr="CYRILLIC_SMALL_LETTER_CHE"
unicode="LATIN_SMALL_LETTER_C_WITH_CARON" />
<translation encoding="ISO8859-2" ucc="0428"
cldr="CYRILLIC CAPITAL LETTER SHA"
unicode="LATIN CAPITAL LETTER S WITH CARON" />
cldr="CYRILLIC_CAPITAL_LETTER_SHA"
unicode="LATIN_CAPITAL_LETTER_S_WITH_CARON" />
<translation encoding="ISO8859-2" ucc="0448"
cldr="CYRILLIC SMALL LETTER SHA"
unicode="LATIN SMALL LETTER S WITH CARON" />
cldr="CYRILLIC_SMALL_LETTER_SHA"
unicode="LATIN_SMALL_LETTER_S_WITH_CARON" />
<translation encoding="ISO8859-2" ucc="0429"
cldr="CYRILLIC CAPITAL LETTER SHCHA"
unicode="LATIN CAPITAL LETTER S WITH CIRCUMFLEX" />
cldr="CYRILLIC_CAPITAL_LETTER_SHCHA"
unicode="LATIN_CAPITAL_LETTER_S_WITH_CIRCUMFLEX" />
<translation encoding="ISO8859-2" ucc="0449"
cldr="CYRILLIC SMALL LETTER SHCHA"
unicode="LATIN SMALL LETTER S WITH CIRCUMFLEX" />
cldr="CYRILLIC_SMALL_LETTER_SHCHA"
unicode="LATIN_SMALL_LETTER_S_WITH_CIRCUMFLEX" />
<translation encoding="ISO8859-2" ucc="042A"
cldr="?CYRILLIC CAPITAL LETTER HARD SIGN" unicode="?" />
cldr="?CYRILLIC_CAPITAL_LETTER_HARD_SIGN" unicode="?" />
<translation encoding="ISO8859-2" ucc="044A"
cldr="?CYRILLIC SMALL LETTER HARD SIGN" unicode="?" />
cldr="?CYRILLIC_SMALL_LETTER_HARD_SIGN" unicode="?" />
<translation encoding="ISO8859-2" ucc="042B"
cldr="?CYRILLIC CAPITAL LETTER YERU" unicode="?" />
cldr="?CYRILLIC_CAPITAL_LETTER_YERU" unicode="?" />
<translation encoding="ISO8859-2" ucc="044B"
cldr="?CYRILLIC SMALL LETTER YERU" unicode="?" />
cldr="?CYRILLIC_SMALL_LETTER_YERU" unicode="?" />
<translation encoding="ISO8859-2" ucc="042C"
cldr="?CYRILLIC CAPITAL LETTER SOFT SIGN" unicode="?" />
cldr="?CYRILLIC_CAPITAL_LETTER_SOFT_SIGN" unicode="?" />
<translation encoding="ISO8859-2" ucc="044C"
cldr="?CYRILLIC SMALL LETTER SOFT SIGN" unicode="?" />
cldr="?CYRILLIC_SMALL_LETTER_SOFT_SIGN" unicode="?" />
<translation encoding="ISO8859-2" ucc="042D"
cldr="CYRILLIC CAPITAL LETTER E"
unicode="LATIN CAPITAL LETTER E WITH GRAVE" />
cldr="CYRILLIC_CAPITAL_LETTER_E"
unicode="LATIN_CAPITAL_LETTER_E_WITH_GRAVE" />
<translation encoding="ISO8859-2" ucc="044D"
cldr="CYRILLIC SMALL LETTER E"
unicode="LATIN SMALL LETTER E WITH GRAVE" />
cldr="CYRILLIC_SMALL_LETTER_E"
unicode="LATIN_SMALL_LETTER_E_WITH_GRAVE" />
<translation encoding="ISO8859-2" ucc="042E"
cldr="?CYRILLIC CAPITAL LETTER YU" unicode="?" />
cldr="?CYRILLIC_CAPITAL_LETTER_YU" unicode="?" />
<translation encoding="ISO8859-2" ucc="044E"
cldr="?CYRILLIC SMALL LETTER YU" unicode="?" />
cldr="?CYRILLIC_SMALL_LETTER_YU" unicode="?" />
<translation encoding="ISO8859-2" ucc="042F"
cldr="CYRILLIC CAPITAL LETTER YA"
unicode="LATIN CAPITAL LETTER A WITH CIRCUMFLEX" />
cldr="CYRILLIC_CAPITAL_LETTER_YA"
unicode="LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX" />
<translation encoding="ISO8859-2" ucc="044F"
cldr="CYRILLIC SMALL LETTER YA"
unicode="LATIN SMALL LETTER A WITH CIRCUMFLEX" />
cldr="CYRILLIC_SMALL_LETTER_YA"
unicode="LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX" />
<translation encoding="ISO8859-2"
cldr="LATIN SMALL LETTER T WITH COMMA BELOW"
unicode="LATIN SMALL LETTER T" />
cldr="LATIN_SMALL_LETTER_T_WITH_COMMA_BELOW"
unicode="LATIN_SMALL_LETTER_T" />
<translation encoding="ISO8859-5"
cldr="MODIFIER LETTER APOSTROPHE" unicode="APOSTROPHE" />
cldr="MODIFIER_LETTER_APOSTROPHE" unicode="APOSTROPHE" />
<translation encoding="ISO8859-5"
cldr="LATIN SMALL LETTER C WITH CARON"
unicode="LATIN SMALL LETTER C" />
cldr="LATIN_SMALL_LETTER_C_WITH_CARON"
unicode="LATIN_SMALL_LETTER_C" />
<translation encoding="KOI8-U"
cldr="MODIFIER LETTER APOSTROPHE" unicode="APOSTROPHE" />
cldr="MODIFIER_LETTER_APOSTROPHE" unicode="APOSTROPHE" />
<translation encoding="CP1251"
cldr="MODIFIER LETTER APOSTROPHE" unicode="APOSTROPHE" />
cldr="MODIFIER_LETTER_APOSTROPHE" unicode="APOSTROPHE" />
<!-- Copied from the original FreeBSD src/share/monetdef -->
<translation encoding="CP1251" cldr="HRYVNIA SIGN" hex="E3F0ED" />
<translation encoding="ISO8859-5" cldr="HRYVNIA SIGN" hex="D3E0DD" />
<translation encoding="KOI8-U" cldr="HRYVNIA SIGN" hex="C7D2CE" />
<translation encoding="CP866" cldr="RUBLE SIGN" hex="E0E3A1" />
<translation encoding="ISO8859-5" cldr="RUBLE SIGN" hex="E0E3D1" />
<translation encoding="CP1251" cldr="RUBLE SIGN" hex="E0E3D1" />
<translation encoding="KOI8-R" cldr="RUBLE SIGN" hex="D2D5C2" />
<translation encoding="CP1251" cldr="HRYVNIA_SIGN" hex="E3F0ED" />
<translation encoding="ISO8859-5" cldr="HRYVNIA_SIGN" hex="D3E0DD" />
<translation encoding="KOI8-U" cldr="HRYVNIA_SIGN" hex="C7D2CE" />
<translation encoding="CP866" cldr="RUBLE_SIGN" hex="E0E3A1" />
<translation encoding="ISO8859-5" cldr="RUBLE_SIGN" hex="E0E3D1" />
<translation encoding="CP1251" cldr="RUBLE_SIGN" hex="E0E3D1" />
<translation encoding="KOI8-R" cldr="RUBLE_SIGN" hex="D2D5C2" />
<!-- These don't have a special Won sign so just use KRW for it -->
<translation encoding="CP949" cldr="WON SIGN" hex="5C" />
<translation encoding="eucKR" cldr="WON SIGN" hex="5C" />
<translation encoding="CP949" cldr="WON_SIGN" hex="5C" />
<translation encoding="eucKR" cldr="WON_SIGN" hex="5C" />
<!-- Asian characters -->
<translation encoding="GB2312 eucCN" cldr="C"
unicode="FULLWIDTH LATIN CAPITAL LETTER C" />
unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_C" />
<translation encoding="Big5" cldr="D"
unicode="FULLWIDTH LATIN CAPITAL LETTER D" />
unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_D" />
<translation encoding="GB2312 eucCN Big5" cldr="N"
unicode="FULLWIDTH LATIN CAPITAL LETTER N" />
unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_N" />
<translation encoding="Big5" cldr="T"
unicode="FULLWIDTH LATIN CAPITAL LETTER T" />
unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_T" />
<translation encoding="Big5" cldr="W"
unicode="FULLWIDTH LATIN CAPITAL LETTER W" />
unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_W" />
<translation encoding="GB2312 eucCN" cldr="Y"
unicode="FULLWIDTH LATIN CAPITAL LETTER Y" />
unicode="FULLWIDTH_LATIN_CAPITAL_LETTER_Y" />
<translation encoding="GB2312 Big5 eucCN" cldr="one"
unicode="FULLWIDTH DIGIT ONE" />
unicode="FULLWIDTH_DIGIT_ONE" />
<translation encoding="GB2312 Big5 eucCN" cldr="two"
unicode="FULLWIDTH DIGIT TWO" />
unicode="FULLWIDTH_DIGIT_TWO" />
<translation encoding="GB2312 Big5 eucCN" cldr="three"
unicode="FULLWIDTH DIGIT THREE" />
unicode="FULLWIDTH_DIGIT_THREE" />
<translation encoding="GB2312 Big5 eucCN" cldr="four"
unicode="FULLWIDTH DIGIT FOUR" />
unicode="FULLWIDTH_DIGIT_FOUR" />
<translation encoding="GB2312 Big5 eucCN" cldr="five"
unicode="FULLWIDTH DIGIT FIVE" />
unicode="FULLWIDTH_DIGIT_FIVE" />
<translation encoding="GB2312 Big5 eucCN" cldr="six"
unicode="FULLWIDTH DIGIT SIX" />
unicode="FULLWIDTH_DIGIT_SIX" />
<translation encoding="GB2312 Big5 eucCN" cldr="seven"
unicode="FULLWIDTH DIGIT SEVEN" />
unicode="FULLWIDTH_DIGIT_SEVEN" />
<translation encoding="GB2312 Big5 eucCN" cldr="eight"
unicode="FULLWIDTH DIGIT EIGHT" />
unicode="FULLWIDTH_DIGIT_EIGHT" />
<translation encoding="GB2312 Big5 eucCN" cldr="nine"
unicode="FULLWIDTH DIGIT NINE" />
unicode="FULLWIDTH_DIGIT_NINE" />
<translation encoding="GB2312 Big5 eucCN" cldr="zero"
unicode="FULLWIDTH DIGIT ZERO" />
unicode="FULLWIDTH_DIGIT_ZERO" />
<translation encoding="GB2312 eucCN Big5" cldr="space"
unicode="IDEOGRAPHIC SPACE" />
<translation encoding="GB2312 eucCN Big5" cldr="FULL STOP"
unicode="FULLWIDTH FULL STOP" />
unicode="IDEOGRAPHIC_SPACE" />
<translation encoding="GB2312 eucCN Big5" cldr="FULL_STOP"
unicode="FULLWIDTH_FULL_STOP" />
<translation encoding="GB2312 eucCN Big5" cldr="SOLIDUS"
unicode="FULLWIDTH SOLIDUS" />
unicode="FULLWIDTH_SOLIDUS" />
<translation encoding="GB2312 eucCN Big5" cldr="COMMA"
unicode="FULLWIDTH COMMA" />
unicode="FULLWIDTH_COMMA" />
<translation encoding="GB2312 eucCN Big5" cldr="HYPHEN-MINUS"
unicode="FULLWIDTH HYPHEN-MINUS" />
<translation encoding="Big5" cldr="DOLLAR SIGN"
unicode="FULLWIDTH DOLLAR SIGN" />
unicode="FULLWIDTH_HYPHEN-MINUS" />
<translation encoding="Big5" cldr="DOLLAR_SIGN"
unicode="FULLWIDTH_DOLLAR_SIGN" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E00" ucc="4E00" />
cldr="CJK_UNIFIED_IDEOGRAPH-4E00" ucc="4E00" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E03" ucc="4E03" />
cldr="CJK_UNIFIED_IDEOGRAPH-4E03" ucc="4E03" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E09" ucc="4E09" />
cldr="CJK_UNIFIED_IDEOGRAPH-4E09" ucc="4E09" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E0A" ucc="4E0A" />
cldr="CJK_UNIFIED_IDEOGRAPH-4E0A" ucc="4E0A" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E0B" ucc="4E0B" />
cldr="CJK_UNIFIED_IDEOGRAPH-4E0B" ucc="4E0B" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E0D" ucc="4E0D" />
cldr="CJK_UNIFIED_IDEOGRAPH-4E0D" ucc="4E0D" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E5D" ucc="4E5D" />
cldr="CJK_UNIFIED_IDEOGRAPH-4E5D" ucc="4E5D" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E8C" ucc="4E8C" />
cldr="CJK_UNIFIED_IDEOGRAPH-4E8C" ucc="4E8C" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-4E94" ucc="4E94" />
cldr="CJK_UNIFIED_IDEOGRAPH-4E94" ucc="4E94" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-516B" ucc="516B" />
cldr="CJK_UNIFIED_IDEOGRAPH-516B" ucc="516B" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-516D" ucc="516D" />
cldr="CJK_UNIFIED_IDEOGRAPH-516D" ucc="516D" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-5206" ucc="5206" />
cldr="CJK_UNIFIED_IDEOGRAPH-5206" ucc="5206" />
<translation encoding="eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-524D" ucc="524D" />
cldr="CJK_UNIFIED_IDEOGRAPH-524D" ucc="524D" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-5341" ucc="5341" />
cldr="CJK_UNIFIED_IDEOGRAPH-5341" ucc="5341" />
<translation
encoding="GB2312 GB18030 GBK Big5 eucCN eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-5348" ucc="5348" />
cldr="CJK_UNIFIED_IDEOGRAPH-5348" ucc="5348" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-5426" ucc="5426" />
cldr="CJK_UNIFIED_IDEOGRAPH-5426" ucc="5426" />
<translation encoding="GB2312 GB18030 GBK eucCN"
cldr="CJK UNIFIED IDEOGRAPH-5468" ucc="5468" />
cldr="CJK_UNIFIED_IDEOGRAPH-5468" ucc="5468" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-56DB" ucc="56DB" />
cldr="CJK_UNIFIED_IDEOGRAPH-56DB" ucc="56DB" />
<translation encoding="eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-571F" ucc="571F" />
cldr="CJK_UNIFIED_IDEOGRAPH-571F" ucc="571F" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-5B9A" ucc="5B9A" />
cldr="CJK_UNIFIED_IDEOGRAPH-5B9A" ucc="5B9A" />
<translation
encoding="GB2312 GB18030 GBK Big5 eucCN eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-5E74" ucc="5E74" />
cldr="CJK_UNIFIED_IDEOGRAPH-5E74" ucc="5E74" />
<translation encoding="eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-5F8C" ucc="5F8C" />
cldr="CJK_UNIFIED_IDEOGRAPH-5F8C" ucc="5F8C" />
<translation
encoding="GB2312 GB18030 GBK Big5 eucCN eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-65E5" ucc="65E5" />
cldr="CJK_UNIFIED_IDEOGRAPH-65E5" ucc="65E5" />
<translation encoding="GB2312 GB18030 GBK eucCN"
cldr="CJK UNIFIED IDEOGRAPH-65F6" ucc="65F6" />
cldr="CJK_UNIFIED_IDEOGRAPH-65F6" ucc="65F6" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-661F" ucc="661F" />
cldr="CJK_UNIFIED_IDEOGRAPH-661F" ucc="661F" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-662F" ucc="662F" />
cldr="CJK_UNIFIED_IDEOGRAPH-662F" ucc="662F" />
<translation encoding="Big5 "
cldr="CJK UNIFIED IDEOGRAPH-6642" ucc="6642" />
cldr="CJK_UNIFIED_IDEOGRAPH-6642" ucc="6642" />
<translation encoding="eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-66DC" ucc="66DC" />
cldr="CJK_UNIFIED_IDEOGRAPH-66DC" ucc="66DC" />
<translation
encoding="GB2312 GB18030 GBK Big5 eucCN eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-6708" ucc="6708" />
cldr="CJK_UNIFIED_IDEOGRAPH-6708" ucc="6708" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-671F" ucc="671F" />
cldr="CJK_UNIFIED_IDEOGRAPH-671F" ucc="671F" />
<translation encoding="eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-6728" ucc="6728" />
cldr="CJK_UNIFIED_IDEOGRAPH-6728" ucc="6728" />
<translation encoding="eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-6C34" ucc="6C34" />
cldr="CJK_UNIFIED_IDEOGRAPH-6C34" ucc="6C34" />
<translation encoding="eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-706B" ucc="706B" />
cldr="CJK_UNIFIED_IDEOGRAPH-706B" ucc="706B" />
<translation encoding="GB2312 GB18030 GBK eucCN"
cldr="CJK UNIFIED IDEOGRAPH-786E" ucc="786E" />
cldr="CJK_UNIFIED_IDEOGRAPH-786E" ucc="786E" />
<translation encoding="Big5 "
cldr="CJK UNIFIED IDEOGRAPH-78BA" ucc="78BA" />
cldr="CJK_UNIFIED_IDEOGRAPH-78BA" ucc="78BA" />
<translation encoding="GB2312 GB18030 GBK Big5 eucCN"
cldr="CJK UNIFIED IDEOGRAPH-79D2" ucc="79D2" />
cldr="CJK_UNIFIED_IDEOGRAPH-79D2" ucc="79D2" />
<translation encoding="Big5 "
cldr="CJK UNIFIED IDEOGRAPH-9031" ucc="9031" />
cldr="CJK_UNIFIED_IDEOGRAPH-9031" ucc="9031" />
<translation encoding="eucJP SJIS"
cldr="CJK UNIFIED IDEOGRAPH-91D1" ucc="91D1" />
cldr="CJK_UNIFIED_IDEOGRAPH-91D1" ucc="91D1" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE GEUM" ucc="AE08" />
cldr="HANGUL_SYLLABLE_GEUM" ucc="AE08" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE NYEON" ucc="B144" />
cldr="HANGUL_SYLLABLE_NYEON" ucc="B144" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE NI" ucc="B2C8" />
cldr="HANGUL_SYLLABLE_NI" ucc="B2C8" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE MOG" ucc="BAA9" />
cldr="HANGUL_SYLLABLE_MOG" ucc="BAA9" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE BUN" ucc="BD84" />
cldr="HANGUL_SYLLABLE_BUN" ucc="BD84" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE SU" ucc="C218" />
cldr="HANGUL_SYLLABLE_SU" ucc="C218" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE SI" ucc="C2DC" />
cldr="HANGUL_SYLLABLE_SI" ucc="C2DC" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE A" ucc="C544" />
cldr="HANGUL_SYLLABLE_A" ucc="C544" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE YE" ucc="C608" />
cldr="HANGUL_SYLLABLE_YE" ucc="C608" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE O" ucc="C624" />
cldr="HANGUL_SYLLABLE_O" ucc="C624" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE YO" ucc="C694" />
cldr="HANGUL_SYLLABLE_YO" ucc="C694" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE WEOL" ucc="C6D4" />
cldr="HANGUL_SYLLABLE_WEOL" ucc="C6D4" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE IL" ucc="C77C" />
cldr="HANGUL_SYLLABLE_IL" ucc="C77C" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE JEON" ucc="C804" />
cldr="HANGUL_SYLLABLE_JEON" ucc="C804" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE CO" ucc="CD08" />
cldr="HANGUL_SYLLABLE_CO" ucc="CD08" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE TO" ucc="D1A0" />
cldr="HANGUL_SYLLABLE_TO" ucc="D1A0" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE HWA" ucc="D654" />
cldr="HANGUL_SYLLABLE_HWA" ucc="D654" />
<translation encoding="eucKR"
cldr="HANGUL SYLLABLE HU" ucc="D6C4" />
cldr="HANGUL_SYLLABLE_HU" ucc="D6C4" />
<translation encoding="ARMSCII-8"
cldr="ONE DOT LEADER" unicode="FULL STOP" />
cldr="ONE_DOT_LEADER" unicode="FULL_STOP" />
<translation encoding="US-ASCII" cldr="POUND SIGN" string="GBP" />
<translation encoding="US-ASCII" cldr="POUND_SIGN" string="GBP" />
<translation encoding="US-ASCII"
cldr="NO-BREAK SPACE" unicode="SPACE" />
cldr="NO-BREAK_SPACE" unicode="SPACE" />
<translation encoding="ISO8859-1 ISO8859-15"
cldr="NARROW NO-BREAK SPACE" unicode="NO-BREAK SPACE" />
cldr="NARROW_NO-BREAK_SPACE" unicode="NO-BREAK_SPACE" />
<!-- punctuation and currency -->
<translation encoding="ISO8859-1 ISO8859-15"
cldr="RIGHT SINGLE QUOTATION MARK" unicode="APOSTROPHE" />
cldr="RIGHT_SINGLE_QUOTATION_MARK" unicode="APOSTROPHE" />
<translation encoding="ISCII-DEV" cldr="INDIAN RUPEE SIGN" hex="FC" />
<translation encoding="ISO8859-1" cldr="PESO SIGN" hex="A4" />
<translation encoding="ISO8859-1" cldr="COLON SIGN" hex="A4" />
<translation encoding="ARMSCII-8" cldr="ARMENIAN DRAM SIGN"
<translation encoding="ISCII-DEV" cldr="INDIAN_RUPEE_SIGN" hex="FC" />
<translation encoding="ISO8859-1" cldr="PESO_SIGN" hex="A4" />
<translation encoding="ISO8859-1" cldr="COLON_SIGN" hex="A4" />
<translation encoding="ARMSCII-8" cldr="ARMENIAN_DRAM_SIGN"
hex="B9F12E" />
<translation encoding="ISO8859-9" cldr="TURKISH LIRA SIGN"
<translation encoding="ISO8859-9" cldr="TURKISH_LIRA_SIGN"
string="TL" />
</translations>

View File

@ -4,6 +4,7 @@
#
# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
# Copyright 2015 John Marino <draco@marino.st>
# Copyright 2020 Hiroki Sato <hrs@FreeBSD.org>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@ -38,7 +39,6 @@ use Getopt::Long;
use Digest::SHA qw(sha1_hex);
require "charmaps.pm";
if ($#ARGV < 2) {
print "Usage: $0 --unidir=<unidir> --etc=<etcdir> --type=<type>\n";
exit(1);
@ -69,10 +69,11 @@ my %encodings = ();
my %alternativemonths = ();
get_languages();
my %utf8map = ();
my %utf8aliases = ();
get_unidata($UNIDIR);
get_utf8map("$UNIDIR/posix/$DEFENCODING.cm");
my %utfmap = ();
$utfmap{'UTF-8'} = {};
$utfmap{'UTF-32'} = {};
get_utfmap("$UNIDIR/posix/$DEFENCODING.cm", $utfmap{'UTF-8'});
get_utfmap("$UNIDIR/posix/UTF-32.cm", $utfmap{'UTF-32'});
get_encodings("$ETCDIR/charmaps");
my %keys = ();
@ -334,25 +335,8 @@ sub callback_abmon {
############################
# Load <directory>/UnicodeData.txt and record, for every line, the first two
# semicolon-separated fields in the file-level %ucd table:
#   $ucd{code2name}{<field 0>} = <field 1>
#   $ucd{name2code}{<field 1>} = <field 0>
# Dies if the file cannot be opened.
sub get_unidata {
    my ($directory) = @_;
    my $path = "$directory/UnicodeData.txt";

    open(FIN, $path)
        or die("Cannot open $path");
    chomp(my @records = <FIN>);
    close(FIN);

    for my $record (@records) {
        # Only the first two fields of each record are used.
        my ($code, $name) = split(/;/, $record);
        $ucd{code2name}{"$code"} = $name;    # Unicode name
        $ucd{name2code}{"$name"} = $code;    # Unicode code
    }
}
sub get_utf8map {
my $file = shift;
sub get_utfmap {
my ($file, $db) = @_;
open(FIN, $file);
my @lines = <FIN>;
@ -363,7 +347,7 @@ sub get_utf8map {
my $prev_v = "";
my $incharmap = 0;
foreach my $l (@lines) {
$l =~ s/\r//;
chomp($l);
next if ($l =~ /^\#/);
next if ($l eq "");
@ -378,17 +362,28 @@ sub get_utf8map {
$l =~ /^<([^\s]+)>\s+(.*)/;
my $k = $1;
my $v = $2;
$k =~ s/_/ /g; # unicode char string
$v =~ s/\\x//g; # UTF-8 char code
$utf8map{$k} = $v;
$db->{$k} = $v;
# print STDERR "UTF $k = $v\n";
$utf8aliases{$k} = $prev_k if ($prev_v eq $v);
# XXX: no longer needed
# $db_alias->{$k} = $prev_k if ($prev_v eq $v);
$prev_v = $v;
$prev_k = $k;
}
}
# Flatten a charmap code field written in "addition" notation.
#
# The argument is split on "+"; a leading "0x"/"0X" is removed from each
# component and the remainders are concatenated, so "0x00+0x00" yields
# "0000" and a plain "0xA4" yields "A4".
sub resolve_enc_addition {
	my @digits = ();
	for my $part (split(/\+/, $_[0])) {
		# Work on a copy; only the prefix at the very start is dropped.
		(my $bare = $part) =~ s/^0[xX]//;
		push(@digits, $bare);
	}
	return join('', @digits);
}
sub get_encodings {
my $dir = shift;
foreach my $e (sort(keys(%encodings))) {
@ -403,14 +398,20 @@ sub get_encodings {
chomp(@lines);
foreach my $l (@lines) {
$l =~ s/\r//;
next if ($l =~ /^\#/);
next if ($l eq "");
my @a = split(" ", $l);
next if ($#a < 1);
$a[0] =~ s/^0[xX]//; # local char code
$a[1] =~ s/^0[xX]//; # unicode char code
$convertors{$e}{uc($a[1])} = uc($a[0]);
next if ($a[0] =~ /^\#/ or $a[1] =~ /^\#/);
next if ($a[0] eq '' or $a[1] eq '');
$a[0] = resolve_enc_addition($a[0]); # local
$a[1] = resolve_enc_addition($a[1]); # UTF-32
my $u32 = sprintf("%08X", hex($a[1]));
# print STDERR "$a[1] => $u32\n";
# Use UTF-32 as the indices.
$convertors{$e}{$u32} = uc($a[0]);
}
}
}
@ -565,8 +566,75 @@ EOF
foreach my $enc (sort keys(%{$languages{$l}{$f}{data}{$c}})) {
next if ($enc eq $DEFENCODING);
copy ("$TYPE.draft/$actfile.$DEFENCODING.src",
"$TYPE.draft/$actfile.$enc.src");
open FIN, "<$TYPE.draft/$actfile.$DEFENCODING.src";
open FOUT, ">$TYPE.draft/$actfile.$enc.src";
my $order_start = 0;
my $print_p = 0;
#
# %c_elem: collation elements
#
# undef: not defined
# 1: defined
# 2: invalid in this encoding
#
my %c_elem = ();
while (<FIN>) { # XXX: this loop should be refactored.
chomp;
$print_p = 1;
if ($order_start) {
$order_start = 0 if (m/^order_end/);
if (m/^<([^>]+)>/) {
if (not defined $c_elem{$1}) {
# print STDERR "$1:\n";
my $u32 = $utfmap{'UTF-32'}->{$1};
die "order, $1\n" if (not defined $u32);
# print STDERR "u32 for $1 = $u32\n";
if (not defined $convertors{$enc}{$u32}) {
# print STDERR "$1 - $u32 not defined in $enc\n";
$print_p = 0;
}
} elsif ($c_elem{$1} == 2) {
# print STDERR "$1 is marked as invalid in $enc\n";
$print_p = 0;
}
}
} elsif (m/^collating-element/) {
my ($elem, $l);
if (m/<([^>]+)> from (.+)/) {
($elem, $l) = ($1, $2);
}
# print STDERR "$elem: enter ($print_p, $l,)\n";
while ($print_p and
defined $l and
$l =~ m/<([^>]+)>/g) {
# print STDERR "$elem: $1\n";
my $u32 = $utfmap{'UTF-32'}->{$1};
die "collating-element, $1\n" if (not defined $u32);
# print STDERR "u32 for $1 = $u32\n";
if (not $convertors{$enc}{$u32}) {
# print STDERR "$1 - $u32 not defined in $enc\n";
$print_p = 0;
# print STDERR "Mark $elem as invalid\n";
$c_elem{$elem} = 2;
}
}
if ($print_p) {
# print STDERR "Add $elem\n";
$c_elem{$elem} = 1;
}
} elsif (m/^collating-symbol <([^>]+)>/) {
# print STDERR "Add $1\n";
$c_elem{$1} = 1;
} elsif (m/^order_start/) {
$order_start = 1;
# do nothing
}
print FOUT $_, "\n" if ($print_p);
}
close FOUT;
close FIN;
$languages{$l}{$f}{data}{$c}{$enc} = $shex;
$hashtable{$shex}{"${l}_${f}_${c}.$enc"} = 1;
}
@ -626,11 +694,11 @@ sub get_fields {
$continue = ($line =~ /\/$/);
$line =~ s/\/$// if ($continue);
while ($line =~ /_/) {
$line =~
s/\<([^>_]+)_([^>]+)\>/<$1 $2>/;
}
die "_ in data - $line" if ($line =~ /_/);
# while ($line =~ /_/) {
# $line =~
# s/\<([^>_]+)_([^>]+)\>/<$1 $2>/;
# }
# die "_ in data - $line" if ($line =~ /_/);
$values{$l}{$f}{$c}{$k} .= $line;
last if (!$continue);
@ -652,56 +720,52 @@ sub decodecldr {
# Conversion to UTF-8 can be done from the Unicode name to
# the UTF-8 character code.
#
$v = $utf8map{$s};
$v = $utfmap{'UTF-8'}->{$s};
die "Cannot convert $s in $e (charmap)" if (!defined $v);
} else {
#
# Conversion to these encodings can be done from the Unicode
# name to Unicode code to the encodings code.
#
my $ucc = undef;
$ucc = $ucd{name2code}{$s} if (defined $ucd{name2code}{$s});
$ucc = $ucd{name2code}{$utf8aliases{$s}}
if (!defined $ucc
&& $utf8aliases{$s}
&& defined $ucd{name2code}{$utf8aliases{$s}});
# hex - hex or string attr
# unicode - unicode attr
# ucc - ucc attr
my $hex = $translations{$e}{$s}{hex};
my $ucc = $utfmap{'UTF-32'}->{$s};
my $ucc_attr = $translations{$e}{$s}{ucc};
my $unicode = $translations{$e}{$s}{unicode};
if (!defined $ucc) {
if (defined $translations{$e}{$s}{hex}) {
$v = $translations{$e}{$s}{hex};
$ucc = 0;
} elsif (defined $translations{$e}{$s}{ucc}) {
$ucc = $translations{$e}{$s}{ucc};
if (defined $hex) { # hex is in local encoding
$v = $hex;
} elsif (defined $unicode) { # unicode is in name
$v = $convertors{$e}{$utfmap{'UTF-32'}->{$unicode}};
} elsif (defined $ucc_attr) { # ucc is in code point
if (defined $ucc) {
# print STDERR "INFO: ucc=$ucc_attr ",
# "overrides $ucc in UTF-32\n";
}
}
die "Cannot convert $s in $e (ucd string)" if (!defined $ucc);
$v = $convertors{$e}{$ucc} if (!defined $v);
$v = $translations{$e}{$s}{hex}
if (!defined $v && defined $translations{$e}{$s}{hex});
if (!defined $v && defined $translations{$e}{$s}{unicode}) {
my $ucn = $translations{$e}{$s}{unicode};
$ucc = $ucd{name2code}{$ucn}
if (defined $ucd{name2code}{$ucn});
$ucc = $ucd{name2code}{$utf8aliases{$ucn}}
if (!defined $ucc
&& defined $ucd{name2code}{$utf8aliases{$ucn}});
# normalize
$ucc_attr = sprintf("%08X", hex($ucc_attr));
# print STDERR "convert $ucc_attr into $e\n";
$v = $convertors{$e}{$ucc_attr};
} elsif (defined $ucc) {
# normalize
$ucc = sprintf("%08X", hex($ucc));
# print STDERR "convert $ucc into $e\n";
$v = $convertors{$e}{$ucc};
}
die "Cannot convert $s in $e (charmap)" if (!defined $v);
die "Cannot convert $s in $e" if (!defined $v);
}
# XXX: length = 8 is not supported yet.
$v =~ s/^[0]+//g;
$v = "0" . $v if (length($v) % 2);
return pack("C", hex($v)) if (length($v) == 2);
return pack("CC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)))
if (length($v) == 4);
return pack("CCC", hex(substr($v, 0, 2)), hex(substr($v, 2, 2)),
hex(substr($v, 4, 2))) if (length($v) == 6);
print STDERR "Cannot convert $e $s\n";
return "length = " . length($v);
die "Cannot convert $s in $e (length = " . length($v) . "\n";
}
sub translate {