locales: only generate unicode locales

This commit is contained in:
Baptiste Daroussin 2021-07-23 16:10:24 +02:00
parent ecff3c30b7
commit fb25fdcaa3
4 changed files with 14 additions and 184 deletions

View File

@ -32,33 +32,9 @@ tools-test:
KNOWN= monetdef numericdef msgdef colldef ctypedef # timedef
TYPES?= ${KNOWN}
# Locale/encoding pairs, written as alternating words: a locale name followed
# by the encoding it is paired with. Assigned with ?= so a caller-provided
# value takes precedence over this default list.
COLLATION_SPECIAL?= \
cs_CZ ISO8859-2 \
da_DK ISO8859-1 \
da_DK ISO8859-15 \
hr_HR ISO8859-2 \
hu_HU ISO8859-2 \
nb_NO ISO8859-1 \
nb_NO ISO8859-15 \
sk_SK ISO8859-2 \
sr_Latn_RS ISO8859-2 \
sr_Cyrl_RS ISO8859-5 \
zh_Hans_CN GB2312 \
zh_Hans_CN eucCN \
zh_Hant_TW Big5 \
zh_Hans_CN GB18030 \
zh_Hans_CN GBK \
ja_JP eucJP \
nn_NO ISO8859-15 \
nn_NO ISO8859-1
# Walk the list two words at a time (${area} = locale, ${enc} = encoding) and
# collect each pair as "<area>.<enc>" in COLLATIONS_SPECIAL_ENV.
.for area enc in ${COLLATION_SPECIAL}
COLLATIONS_SPECIAL_ENV+= ${area}.${enc}
.endfor
SETENV= env -i \
PATH="${PATH}" \
TMPDIR="${TMPDIR}" \
COLLATIONS_SPECIAL="${COLLATIONS_SPECIAL_ENV}" \
UNIDIR="${UNIDIR}" \
BASEDIR="${BASEDIR}" \
TOOLSDIR="${TOOLSDIR}" \
@ -89,16 +65,22 @@ diff-${t}:
.endfor
install:
.for t in ${TYPES}
.for t in ${TYPES:Nctypedef}
. if ${KNOWN:M${t}}
install: install-${t}
install-${t}:
cd ${LOCALESRCDIR}/${t} && \
cd ${LOCALESRCDIR}/${t}_unicode && \
rm -f Makefile *.src && \
cd ${.OBJDIR} && \
install -m 644 ${t}/* ${LOCALESRCDIR}/${t}
install -m 644 ${t}/* ${LOCALESRCDIR}/${t}_unicode
. endif
.endfor
install: install-ctypedef
install-ctypedef:
cd ${LOCALESRCDIR}/ctypedef && \
rm -f C.UTF-8.src && \
cd ${.OBJDIR} && \
install -m 644 ctypedef/C.UTF-8.src ${LOCALESRCDIR}/ctypedef
post-install:
.for t in ${TYPES}
@ -121,15 +103,6 @@ build-${t}: ${t}
${SETENV} OUTBASEDIR="${.OBJDIR}/${t}" ${TOOLSDIR}/finalize ${t}
.endfor
# static-colldef depends on colldef and is itself a prerequisite of
# build-colldef. For every (area, enc) pair in COLLATION_SPECIAL its recipe
# runs extract-colldef.awk over posix/<area>.<enc>.src and redirects the
# output to colldef.draft/<area>.<enc>.src.
static-colldef: colldef
build-colldef: static-colldef
static-colldef:
.for area enc in ${COLLATION_SPECIAL}
awk -f ${TOOLSDIR}/extract-colldef.awk \
posix/${area}.${enc}.src > colldef.draft/${area}.${enc}.src
.endfor
BASE_LOCALES_OF_INTEREST?= \
af_ZA am_ET ar_AE ar_EG ar_JO ar_MA ar_QA ar_SA \
be_BY bg_BG ca_AD ca_ES ca_FR ca_IT \
@ -147,35 +120,14 @@ BASE_LOCALES_OF_INTEREST?= \
th_TH lo_LA bo_IN my_MM pa_Guru_IN ka_GE chr_US \
km_KH shi_Tfng_MA ii_CN vai_Vaii_LR vi_VN
ENCODINGS= Big5 \
CP1251 \
CP866 \
CP949 \
eucCN \
eucJP \
eucKR \
GB18030 \
GB2312 \
GBK \
ISO8859-1 \
ISO8859-13 \
ISO8859-15 \
ISO8859-2 \
ISO8859-5 \
ISO8859-7 \
ISO8859-9 \
KOI8-R \
KOI8-U \
SJIS \
US-ASCII \
UTF-8 \
ENCODINGS= UTF-8 \
UTF-32
# CLDR files
CLDRFILES_CORE= https://unicode.org/Public/cldr/35/core.zip
CLDRFILES_KEY= https://unicode.org/Public/cldr/35/keyboards.zip
CLDRFILES_TOOLS=https://unicode.org/Public/cldr/35/tools.zip
CLDRFILES_UCD= http://www.unicode.org/Public/zipped/latest/UCD.zip
CLDRFILES_UCD= http://www.unicode.org/Public/zipped/13.0.0/UCD.zip
# fetch and extract targets
${UNIDIR}:
@ -206,8 +158,8 @@ build-tools:
JAVA_CLDR= java -DCLDR_DIR=${UNIDIR:Q} -jar ${UNIDIR}/tools/java/cldr.jar
posix: posixcm post-posixcm posixsrc posixcol
.ORDER: posixcm post-posixcm posixsrc posixcol
posix: posixcm post-posixcm posixsrc
.ORDER: posixcm post-posixcm posixsrc
${UNIDIR}/posix:
ln -s -f ../posix ${.TARGET}
clean-posix:
@ -232,14 +184,6 @@ posix/${area}.UTF-8.src:
${JAVA_CLDR} org.unicode.cldr.posix.GeneratePOSIX \
-d posix -m ${area} -c UTF-8
.endfor
# For every (area, encoding) pair in COLLATION_SPECIAL: make posixcol depend
# on build-tools and on posix/<area>.<encoding>.src, and declare via .ORDER
# that build-tools comes before that source file. The source file itself is
# produced by creating the posix directory and then running the CLDR
# GeneratePOSIX class through ${JAVA_CLDR} with "-d posix -m <area>
# -c <encoding>".
.for area encoding in ${COLLATION_SPECIAL}
posixcol: build-tools posix/${area}.${encoding}.src
.ORDER: build-tools posix/${area}.${encoding}.src
posix/${area}.${encoding}.src:
mkdir -p posix && \
${JAVA_CLDR} org.unicode.cldr.posix.GeneratePOSIX \
-d posix -m ${area} -c ${encoding}
.endfor
# generate widths.txt using the data from libutf8proc
GETWIDTHS=${TOOLSDIR}/getwidths

View File

@ -28,169 +28,122 @@
-->
<language name="af"
encoding="ISO8859-1 ISO8859-15"
countries="ZA" />
<language name="am"
countries="ET" /> <!-- UTF-8 only -->
<language name="ar"
countries="AE EG JO MA QA SA" />
<language name="be"
encoding="CP1131 CP1251 ISO8859-5"
countries="BY" />
<language name="bg"
encoding="CP1251"
countries="BG" />
<language name="ca"
fallback="ca_ES"
encoding="ISO8859-1 ISO8859-15"
countries="AD ES FR IT" /> <!-- only ca_ES defined -->
<language name="cs"
encoding="ISO8859-2"
countries="CZ" />
<language name="da"
encoding="ISO8859-1 ISO8859-15"
countries="DK" />
<language name="de"
encoding="ISO8859-1 ISO8859-15"
countries="AT CH DE" />
<language name="el"
encoding="ISO8859-7"
countries="GR" />
<language name="en"
encoding="ISO8859-1 ISO8859-15 US-ASCII"
countries="GB" />
<language name="en"
encoding="ISO8859-1 ISO8859-15 US-ASCII"
countries="AU CA NZ US ZA" />
<language name="en"
encoding="ISO8859-1 ISO8859-15"
countries="IE" />
<language name="en"
encoding="ISO8859-1"
countries="HK SG" />
<language name="en"
countries="PH" /> <!-- UTF-8 only -->
<language name="es"
countries="CR" /> <!-- UTF-8 only -->
<language name="es"
encoding="ISO8859-1 ISO8859-15"
countries="ES" />
<language name="es"
encoding="ISO8859-1"
countries="AR MX" />
<language name="et"
encoding="ISO8859-1 ISO8859-15"
countries="EE" />
<language name="eu"
encoding="ISO8859-1 ISO8859-15"
countries="ES" />
<language name="fi"
encoding="ISO8859-1 ISO8859-15"
countries="FI" />
<language name="fr"
encoding="ISO8859-1 ISO8859-15"
countries="BE CH FR" />
<language name="fr"
encoding="ISO8859-1 ISO8859-15"
countries="CA" />
<language name="ga"
countries="IE" /> <!-- UTF-8 only -->
<language name="he"
countries="IL" />
<language name="hi"
encoding="ISCII-DEV"
countries="IN" />
<language name="hr"
encoding="ISO8859-2"
countries="HR" />
<language name="hu"
encoding="ISO8859-2"
countries="HU" />
<language name="hy"
encoding="ARMSCII-8"
countries="AM" />
<language name="is"
encoding="ISO8859-1 ISO8859-15"
countries="IS" />
<language name="it"
encoding="ISO8859-1 ISO8859-15"
countries="CH IT" />
<language name="ja"
encoding="SJIS eucJP"
countries="JP" />
<language name="kk"
countries="KZ" /> <!-- PT154 not available, UTF-8 -->
<language name="ko"
encoding="eucKR"
encoding_link="eucKR:CP949"
countries="KR" />
<language name="lt"
encoding="ISO8859-13"
countries="LT" />
<language name="lv"
encoding="ISO8859-13"
countries="LV" />
<language name="mn"
countries="MN" />
<language name="nb"
encoding="ISO8859-1 ISO8859-15"
countries="NO" />
<language name="nl"
encoding="ISO8859-1 ISO8859-15"
countries="BE NL" />
<language name="nn"
encoding="ISO8859-1 ISO8859-15"
countries="NO" />
<language name="pl"
encoding="ISO8859-2"
countries="PL" />
<language name="pt"
encoding="ISO8859-1 ISO8859-15"
countries="PT" />
<language name="pt"
encoding="ISO8859-1"
countries="BR" />
<language name="ro"
encoding="ISO8859-2"
countries="RO" />
<language name="ru"
encoding="CP1251 CP866 ISO8859-5 KOI8-R"
countries="RU" />
<language name="se"
countries="NO FI" />
<language name="sk"
encoding="ISO8859-2"
countries="SK" />
<language name="sl"
encoding="ISO8859-2"
countries="SI" />
<language name="sr"
family="Latn"
encoding="ISO8859-2"
countries="RS" />
<language name="sr"
family="Cyrl"
encoding="ISO8859-5"
countries="RS" />
<language name="sv"
encoding="ISO8859-1 ISO8859-15"
countries="SE FI" />
<language name="tr"
encoding="ISO8859-9"
countries="TR" />
<language name="uk"
encoding="CP1251 ISO8859-5 KOI8-U"
countries="UA" />
<language name="zh"
family="Hans"
encoding="GB18030 GB2312 GBK eucCN"
countries="CN" />
<language name="zh"
family="Hant"
countries="HK" />
<language name="zh"
family="Hant"
encoding="Big5"
countries="TW" />
</languages>

View File

@ -65,7 +65,6 @@ my %values = ();
my %hashtable = ();
my %languages = ();
my %translations = ();
my %encodings = ();
my %alternativemonths = ();
get_languages();
@ -74,7 +73,6 @@ $utfmap{'UTF-8'} = {};
$utfmap{'UTF-32'} = {};
get_utfmap("$UNIDIR/posix/$DEFENCODING.cm", $utfmap{'UTF-8'});
get_utfmap("$UNIDIR/posix/UTF-32.cm", $utfmap{'UTF-32'});
get_encodings("$ETCDIR/charmaps");
my %keys = ();
tie(%keys, "Tie::IxHash");
@ -384,44 +382,11 @@ sub resolve_enc_addition {
return $ret;
}
# Load the charmap table of every encoding named in %encodings.
#
# Argument:
#   $dir - directory holding one "<encoding>.TXT" charmap file per encoding.
#
# For each encoding (visited in sorted order) the file "$dir/<encoding>.TXT"
# is read line by line. A line is used only when it has at least two
# whitespace-separated fields and neither of the first two starts with '#'.
# Field 0 is the code in the local encoding and field 1 the UTF-32 code; both
# are passed through resolve_enc_addition() first. The result is stored as
#   $convertors{<encoding>}{<UTF-32 code, 8 upper-case hex digits>}
#       = <upper-cased local code>
# Encodings whose charmap cannot be opened are reported on standard output
# and skipped; the others get $encodings{<encoding>} set to 1.
#
# The file is opened with a lexical handle and three-argument open() so the
# path is never interpreted as a mode/pipe specification and no global
# bareword handle is shared with other code.
sub get_encodings {
	my $dir = shift;

	foreach my $e (sort(keys(%encodings))) {
		my $fin;
		if (!open($fin, '<', "$dir/$e.TXT")) {
			print "Cannot open charmap for $e\n";
			next;
		}
		$encodings{$e} = 1;

		while (my $l = <$fin>) {
			chomp($l);
			$l =~ s/\r//;		# tolerate CRLF line endings
			next if ($l eq "");

			my @a = split(" ", $l);
			next if ($#a < 1);	# need at least two fields
			next if ($a[0] =~ /^\#/ or $a[1] =~ /^\#/);
			next if ($a[0] eq '' or $a[1] eq '');

			$a[0] = resolve_enc_addition($a[0]);	# local
			$a[1] = resolve_enc_addition($a[1]);	# UTF-32

			# Use UTF-32 as the indices.
			my $u32 = sprintf("%08X", hex($a[1]));
			$convertors{$e}{$u32} = uc($a[0]);
		}
		close($fin);
	}
}
# Fill the file-level lookup tables from the XML data found under $ETCDIR.
# get_xmldata() is called in list context and its result is treated as a
# hash; the hash references stored under the keys L, T, AM and E are copied
# into %languages, %translations, %alternativemonths and %encodings.
sub get_languages {
	my $xml = { get_xmldata($ETCDIR) };

	%languages         = %{ $xml->{L} };
	%translations      = %{ $xml->{T} };
	%alternativemonths = %{ $xml->{AM} };
	%encodings         = %{ $xml->{E} };
}
sub transform_ctypes {

View File

@ -82,7 +82,6 @@ for i in *_*_*.*.src; do
nname=`echo $oldname | awk '{ split($0, a, "_"); print a[1]"_"a[3]"@"a[2];} '`
mv -f ${oldname}.src ${nname}.src
sed -i '' -e "s/${oldname}/${nname}/g" Makefile
COLLATIONS_SPECIAL=$(echo ${COLLATIONS_SPECIAL} | sed -e "s/${oldname}/${nname}/g")
done
# For locales with only one @modifier variant (no ambiguity), do not keep the @modifier
@ -95,7 +94,6 @@ for i in *@*.src; do
if [ $(ls ${shortname}@* | wc -l) -eq 1 ] ; then
mv -f $i ${shortname}.src
sed -i '' -e "s/${oldname}/${shortname}/g" Makefile
COLLATIONS_SPECIAL=$(echo ${COLLATIONS_SPECIAL} | sed -e "s/${oldname}/${shortname}/g")
fi
done
@ -106,7 +104,6 @@ for i in *@Latn.src; do
fi
mv -f ${i} ${i%@*}@latin.src
sed -i '' -e "s/${i%.*}/${i%@*}@latin/g" Makefile
COLLATIONS_SPECIAL=$(echo ${COLLATIONS_SPECIAL} | sed -e "s/${i%.*}/${i%@*}@latin/g")
done
for i in *@Cyrl.src; do
@ -115,7 +112,6 @@ for i in *@Cyrl.src; do
fi
mv -f ${i} ${i%@*}@cyrillic.src
sed -i '' -e "s/${i%.*}/${i%@*}@cyrillic/g" Makefile
COLLATIONS_SPECIAL=$(echo ${COLLATIONS_SPECIAL} | sed -e "s/${i%.*}/${i%@*}@cyrillic/g")
done
# On locales with multiple modifiers rename the "default" version without the @modifier
@ -150,30 +146,6 @@ then
/usr/bin/sed -E -e 's/[ ]+/ /g' \
${UNIDIR}/posix/UTF-8.cm \
> ${ETCDIR}/final-maps/map.UTF-8
/usr/bin/sed -E -e 's/[ ]+/ /g' \
${UNIDIR}/posix/eucCN.cm \
> ${ETCDIR}/final-maps/map.eucCN
/usr/bin/sed -E -e 's/[ ]+/ /g' \
${UNIDIR}/posix/eucCN.cm \
> ${ETCDIR}/final-maps/map.GB2312
# GB18030 and Big5 are pre-generated from CLDR data
# The charmaps named in CHARMAPS are converted one at a time by the loop
# below.
CHARMAPS="ARMSCII-8 CP1131 CP1251 \
CP866 GBK ISCII-DEV ISO8859-1 \
ISO8859-13 ISO8859-15 ISO8859-2 ISO8859-4 \
ISO8859-5 ISO8859-7 ISO8859-9 KOI8-R KOI8-U \
PT154 SJIS US-ASCII eucJP eucKR"
# For each name: run convert_map.pl on ${ETCDIR}/charmaps/<name>.TXT, pipe
# its output through the sed substitution, and store the result as
# ${ETCDIR}/final-maps/map.<name>. A progress line is printed per map.
for map in ${CHARMAPS}
do
# The encoding name handed to convert_map.pl as its second argument is
# always the map name itself.
encoding=${map}
env ETCDIR="${ETCDIR}" \
/usr/local/bin/perl ${TOOLSDIR}/convert_map.pl \
${ETCDIR}/charmaps/${map}.TXT ${encoding} \
| /usr/bin/sed -E -e 's/ +/ /g' \
> ${ETCDIR}/final-maps/map.${map}
echo map ${map} converted.
done
elif [ $1 = "colldef" ]
then
@ -190,13 +162,9 @@ then
sed -i '' "/^SAME.*$line$/d" ${old}/Makefile
done
echo "" >> ${TEMP4}
# For every entry in COLLATIONS_SPECIAL: delete any line of ${TEMP4} that ends
# with the entry, then append an explicit "LOCALES+= <entry>" line for it.
for enc in ${COLLATIONS_SPECIAL}; do
sed -i '' "/^.*${enc}$/d" ${TEMP4}
echo "LOCALES+= ${enc}" >> ${TEMP4}
done
keep=$(cat ${TEMP} | awk '{ print $2 }')
for original in ${keep} ${COLLATIONS_SPECIAL}
for original in ${keep}
do
cp ${old}/${original}.src ${new}/
done