update wcwidth data from utf8proc

Character width data being out of date is a constant source
of weird rendering issues and wasted time trying to diagnose
those, e.g. as reported by Jeremy Chadwick:

https://gitlab.com/muttmua/mutt/-/issues/67

Sadly, there is no real ("standard") wcwidth data source, so
this tries to rectify the problem using the utf8proc one (through
its C API) which would hopefully benefit both FreeBSD and
utf8proc through bug reports (if any).

Reviewed by:	bapt
Differential Revision:	https://reviews.freebsd.org/D27259
This commit is contained in:
yuripv 2020-12-06 16:44:41 +00:00
parent f7b0aedd1c
commit b803311a18
5 changed files with 71852 additions and 433 deletions

View File

@ -225,7 +225,7 @@ posix/${enc}.cm:
.for area in ${BASE_LOCALES_OF_INTEREST}
posixsrc: build-tools posix/${area}.UTF-8.src
.ORDER: build-tools posix/${area}.UTF-8.src
posix/${area}.UTF-8.src:
posix/${area}.UTF-8.src:
mkdir -p posix && \
${JAVA_CLDR} org.unicode.cldr.posix.GeneratePOSIX \
-d posix -m ${area} -c UTF-8
@ -239,4 +239,20 @@ posix/${area}.${encoding}.src:
-d posix -m ${area} -c ${encoding}
.endfor
# generate widths.txt using the data from libutf8proc
# GETWIDTHS is the helper whose output is piped into the MKWIDTHS script;
# WIDTHS is the generated file.
GETWIDTHS=${TOOLSDIR}/getwidths
MKWIDTHS=${TOOLSDIR}/mkwidths.pl
WIDTHS= ${ETCDIR}/final-maps/widths.txt
# Compiler and linker flags for libutf8proc are queried from pkgconf when
# the makefile is parsed.
U8CFLAGS!=pkgconf --cflags libutf8proc
U8LIBS!=pkgconf --libs libutf8proc
CFLAGS+=${U8CFLAGS}
LDFLAGS+=${U8LIBS}
CLEANFILES+=${TOOLSDIR}/getwidths
# "make widths" regenerates ${WIDTHS}; it depends on the posixcm target and
# on the getwidths helper, and reads posix/UTF-8.cm from the object directory.
widths: ${WIDTHS}
${WIDTHS}: posixcm ${GETWIDTHS}
${GETWIDTHS} | ${MKWIDTHS} ${.OBJDIR}/posix/UTF-8.cm ${.TARGET}
.include <bsd.obj.mk>

View File

@ -55,4 +55,8 @@ Targets:
make install
Install the build results into $LOCALESRCDIR.
make widths
Generate widths.txt. Requires pkgconf and utf8proc
packages to be installed.
[EOF]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,51 @@
/*-
* Copyright 2020 Yuri Pankov <yuripv@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#include <locale.h>
#include <stdio.h>
#include <utf8proc.h>
/*
 * Print the utf8proc version on the first line, followed by one
 * "<hex code point> <width>" line for every code point in the range
 * 0..0x10FFFF whose utf8proc_charwidth() result is not 1.
 */
int
main(void)
{
	int32_t wc;
	int wcw;

	setlocale(LC_CTYPE, "C.UTF-8");

	printf("%s\n", utf8proc_version());

	for (wc = 0; wc < 0x110000; wc++) {
		wcw = utf8proc_charwidth(wc);
		/* Width 1 is left out of the output. */
		if (wcw == 1)
			continue;
		/* %X takes an unsigned int; wc is never negative here. */
		printf("%04X %d\n", (unsigned int)wc, wcw);
	}

	return (0);
}

View File

@ -0,0 +1,125 @@
#!/usr/local/bin/perl -w
# SPDX-License-Identifier: BSD-2-Clause-FreeBSD
#
# Copyright 2009 Edwin Groothuis <edwin@FreeBSD.org>
# Copyright 2015 John Marino <draco@marino.st>
# Copyright 2020 Yuri Pankov <yuripv@FreeBSD.org>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
# $FreeBSD$
use strict;
use Encode qw(encode decode);
# Maps an upper-case hexadecimal code point string (as produced by
# utf8to32(), e.g. "20AC") to the symbolic name found in the UTF-8 charmap.
my %utf8map = ();

# Both arguments are required; without them the script would only emit
# "uninitialized value" warnings and produce an empty result.
die("usage: $0 <utf8-charmap-file> <output-file>\n") if (@ARGV < 2);

my $utf8charmap = $ARGV[0];
my $outfilename = $ARGV[1];

# The width records themselves arrive on STDIN: generate_header() consumes
# the first line, make_widths() consumes the rest.
get_utf8map($utf8charmap);
generate_header();
make_widths($outfilename);
generate_footer();
############################
# Convert a charmap byte sequence written as "\xHH\xHH..." (the UTF-8
# encoding of a character) into its upper-case hexadecimal code point,
# zero-padded to at least four digits, e.g. '\xE2\x82\xAC' -> "20AC".
sub utf8to32 {
	my @kl = split /\\x/, $_[0];
	# split leaves an empty leading field because the string starts with "\x"
	shift @kl if (@kl and $kl[0] eq '');

	my $k = pack('H2' x scalar(@kl), @kl);
	# UTF-32BE never emits a byte order mark, so nothing has to be stripped
	# here; removing a leading 0000FEFF would erase the code point of
	# U+FEFF itself and register it under an empty key.
	my $ux = encode('UTF-32BE', decode('UTF-8', $k));
	my $u = uc(unpack('H*', $ux));
	# Remove heading bytes of 0, but keep at least four digits
	while ($u =~ m/^0/ and length($u) > 4) {
		$u =~ s/^0//;
	}

	return $u;
}
# Read the UTF-8 charmap file and fill %utf8map with
# "hex code point" => "<symbolic name>" for every entry found between the
# CHARMAP and END CHARMAP lines.  Dies if the file cannot be opened.
sub get_utf8map {
	my $file = shift;

	open(my $fin, '<', $file)
	    or die("can't read $file: $!\n");

	my $incharmap = 0;
	while (my $l = <$fin>) {
		chomp($l);
		$l =~ s/\r//;
		next if ($l =~ /^\#/);
		next if ($l eq "");
		if ($l eq "CHARMAP") {
			$incharmap = 1;
			next;
		}
		next if (!$incharmap);
		last if ($l eq "END CHARMAP");
		# Skip anything that is not "<name> bytes"; using $1/$2 after a
		# failed match would register the previous entry a second time.
		next unless ($l =~ /^(<[^\s]+>)\s+(.*)/);
		my ($v, $bytes) = ($1, $2);
		my $k = utf8to32($bytes);	# UTF-8 char code
#		print STDERR "register: $k - $v\n";
		$utf8map{$k} = $v;
	}
	close($fin);
}
# Read the utf8proc version (the first line on STDIN), open the output file
# on the FOUT handle and write the banner followed by the opening WIDTH line.
# Dies if STDIN is empty or the output file cannot be opened.
sub generate_header {
	my $version = <STDIN>;
	# No input at all means the width generator produced nothing; stop
	# before the output file gets truncated.
	die("no utf8proc version line on stdin\n") if (!defined $version);
	chomp($version);
	open(FOUT, ">", "$outfilename")
	    or die ("can't write to $outfilename: $!\n");
	print FOUT <<EOF;
# Warning: Do not edit. This file is automatically generated from the
# tools in /usr/src/tools/tools/locale. The data is obtained from the
# utf8proc $version.
# -----------------------------------------------------------------------------
WIDTH
EOF
}
# Write the closing END WIDTH line and close the FOUT handle.  Buffered
# write errors only surface at close time, so a failing close is fatal.
sub generate_footer {
	print FOUT "END WIDTH\n";
	close(FOUT)
	    or die("can't close output file: $!\n");
}
# Read "<hex code point> <width>" records from STDIN and write
# "<symbolic name>\t<width>" to FOUT for each code point that has an entry
# in %utf8map; every other code point is skipped.
sub make_widths {
	while (defined(my $record = <STDIN>)) {
		chomp($record);
		my ($codepoint, $width) = split(/ /, $record, -1);
		my $name = $utf8map{$codepoint};
		if (defined $name) {
			print FOUT "$name\t$width\n";
		}
	}
}