Add a UTF-8 encoding method, which will eventually replace the antique

"UTF2" method. Although UTF-8 and the old UTF2 encoding are compatible
for 16-bit characters, the new UTF-8 implementation is much more strict
about rejecting malformed input and also handles the full 31 bit range
of characters.
This commit is contained in:
Tim J. Robbins 2002-10-10 22:56:18 +00:00
parent 9b30d71989
commit 972baa3747
10 changed files with 349 additions and 9 deletions

View File

@ -11,7 +11,8 @@ SRCS+= big5.c btowc.c collate.c collcmp.c euc.c fix_grouping.c frune.c \
mbrtowc.c mbrune.c mbsinit.c mbsrtowcs.c mbtowc.c mbstowcs.c \
mskanji.c nl_langinfo.c nomacros.c none.c rune.c \
runetype.c setinvalidrune.c setlocale.c setrunelocale.c table.c \
tolower.c toupper.c utf2.c wcrtomb.c wcsrtombs.c wcsftime.c wcstod.c \
tolower.c toupper.c utf2.c utf8.c wcrtomb.c wcsrtombs.c wcsftime.c \
wcstod.c \
wcstoimax.c wcstol.c wcstoll.c \
wcstombs.c \
wcstoul.c wcstoull.c wcstoumax.c wctob.c wctomb.c wctrans.c wctype.c \
@ -31,6 +32,7 @@ MAN+= btowc.3 \
wcsrtombs.3 wcstod.3 wcstol.3 \
wctrans.3 wctype.3 wcwidth.3
MAN+= euc.4 utf2.4
MAN+= utf8.5
MLINKS+=btowc.3 wctob.3
MLINKS+=isdigit.3 isnumber.3

View File

@ -161,7 +161,8 @@ does not appear in the string.
.Xr rune 3 ,
.Xr setlocale 3 ,
.Xr euc 4 ,
.Xr utf2 4
.Xr utf2 4 ,
.Xr utf8 5
.Sh HISTORY
The
.Fn mbrune ,

View File

@ -232,7 +232,8 @@ both functions return \-1.
.Xr wcrtomb 3 ,
.Xr wcsrtombs 3 ,
.Xr euc 4 ,
.Xr utf2 4
.Xr utf2 4 ,
.Xr utf8 5
.Sh STANDARDS
The
.Fn mblen ,

View File

@ -265,7 +265,8 @@ binary LC_CTYPE file for the locale
.Xr mbrune 3 ,
.Xr setlocale 3 ,
.Xr euc 4 ,
.Xr utf2 4
.Xr utf2 4 ,
.Xr utf8 5
.Sh HISTORY
These functions first appeared in
.Bx 4.4 .

View File

@ -334,7 +334,8 @@ and the category
.Xr strcoll 3 ,
.Xr strxfrm 3 ,
.Xr euc 4 ,
.Xr utf2 4
.Xr utf2 4 ,
.Xr utf8 5
.Sh STANDARDS
The
.Fn setlocale

View File

@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$");
extern int _none_init(_RuneLocale *);
extern int _UTF2_init(_RuneLocale *);
extern int _UTF8_init(_RuneLocale *);
extern int _EUC_init(_RuneLocale *);
extern int _BIG5_init(_RuneLocale *);
extern int _MSKanji_init(_RuneLocale *);
@ -130,6 +131,8 @@ setrunelocale(char *encoding)
ret = _none_init(rl);
else if (strcmp(rl->encoding, "UTF2") == 0)
ret = _UTF2_init(rl);
else if (strcmp(rl->encoding, "UTF-8") == 0)
ret = _UTF8_init(rl);
else if (strcmp(rl->encoding, "EUC") == 0)
ret = _EUC_init(rl);
else if (strcmp(rl->encoding, "BIG5") == 0)

View File

@ -35,7 +35,7 @@
.\" @(#)utf2.4 8.1 (Berkeley) 6/4/93
.\" $FreeBSD$
.\"
.Dd June 4, 1993
.Dd October 11, 2002
.Dt UTF2 4
.Os
.Sh NAME
@ -45,6 +45,11 @@
.Nm ENCODING
.Qq UTF2
.Sh DESCRIPTION
.Bf Em
The UTF2 encoding has been deprecated in favour of UTF-8.
.Ef
New applications should not use UTF2.
.Pp
The
.Nm UTF2
encoding is based on a proposed X-Open multibyte
@ -85,4 +90,5 @@ which provides for the entire proposed ISO-10646 31 bit standard are currently
not implemented.
.Sh "SEE ALSO"
.Xr mklocale 1 ,
.Xr setlocale 3
.Xr setlocale 3 ,
.Xr utf8 5

View File

@ -35,7 +35,7 @@
.\" @(#)utf2.4 8.1 (Berkeley) 6/4/93
.\" $FreeBSD$
.\"
.Dd June 4, 1993
.Dd October 11, 2002
.Dt UTF2 4
.Os
.Sh NAME
@ -45,6 +45,11 @@
.Nm ENCODING
.Qq UTF2
.Sh DESCRIPTION
.Bf Em
The UTF2 encoding has been deprecated in favour of UTF-8.
.Ef
New applications should not use UTF2.
.Pp
The
.Nm UTF2
encoding is based on a proposed X-Open multibyte
@ -85,4 +90,5 @@ which provides for the entire proposed ISO-10646 31 bit standard are currently
not implemented.
.Sh "SEE ALSO"
.Xr mklocale 1 ,
.Xr setlocale 3
.Xr setlocale 3 ,
.Xr utf8 5

115
lib/libc/locale/utf8.5 Normal file
View File

@ -0,0 +1,115 @@
.\" Copyright (c) 1993
.\" The Regents of the University of California. All rights reserved.
.\"
.\" This code is derived from software contributed to Berkeley by
.\" Paul Borman at Krystal Technologies.
.\"
.\" Redistribution and use in source and binary forms, with or without
.\" modification, are permitted provided that the following conditions
.\" are met:
.\" 1. Redistributions of source code must retain the above copyright
.\" notice, this list of conditions and the following disclaimer.
.\" 2. Redistributions in binary form must reproduce the above copyright
.\" notice, this list of conditions and the following disclaimer in the
.\" documentation and/or other materials provided with the distribution.
.\" 3. All advertising materials mentioning features or use of this software
.\" must display the following acknowledgement:
.\" This product includes software developed by the University of
.\" California, Berkeley and its contributors.
.\" 4. Neither the name of the University nor the names of its contributors
.\" may be used to endorse or promote products derived from this software
.\" without specific prior written permission.
.\"
.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
.\" @(#)utf2.4 8.1 (Berkeley) 6/4/93
.\" $FreeBSD$
.\"
.Dd October 10, 2002
.Dt UTF8 5
.Os
.Sh NAME
.Nm utf8
.Nd "UTF-8, a transformation format of ISO 10646"
.Sh SYNOPSIS
.Nm ENCODING
.Qq UTF-8
.Sh DESCRIPTION
The
.Nm UTF-8
encoding represents UCS-4 characters as a sequence of octets, using
between 1 and 6 for each character.
It is backwards compatible with
.Tn ASCII ,
so 0x00-0x7f refer to the
.Tn ASCII
character set.
The multibyte encoding of non-
.Tn ASCII
characters
consists entirely of bytes whose high order bit is set.
The actual
encoding is represented by the following table:
.Bd -literal
[0x00000000 - 0x0000007f] [00000000.0bbbbbbb] -> 0bbbbbbb
[0x00000080 - 0x000007ff] [00000bbb.bbbbbbbb] -> 110bbbbb, 10bbbbbb
[0x00000800 - 0x0000ffff] [bbbbbbbb.bbbbbbbb] ->
1110bbbb, 10bbbbbb, 10bbbbbb
[0x00010000 - 0x001fffff] [00000000.000bbbbb.bbbbbbbb.bbbbbbbb] ->
11110bbb, 10bbbbbb, 10bbbbbb, 10bbbbbb
[0x00200000 - 0x03ffffff] [000000bb.bbbbbbbb.bbbbbbbb.bbbbbbbb] ->
111110bb, 10bbbbbb, 10bbbbbb, 10bbbbbb, 10bbbbbb
[0x04000000 - 0x7fffffff] [0bbbbbbb.bbbbbbbb.bbbbbbbb.bbbbbbbb] ->
1111110b, 10bbbbbb, 10bbbbbb, 10bbbbbb, 10bbbbbb, 10bbbbbb
.Ed
.Pp
If more than a single representation of a value exists (for example,
0x00; 0xC0 0x80; 0xE0 0x80 0x80) the shortest representation is always
used.
Longer ones are detected as an error as they pose a potential
security risk, and destroy the 1:1 character:octet sequence mapping.
.Sh COMPATIBILITY
The
.Nm
encoding supersedes the
.Xr utf2 4
encoding.
The only differences between the two are that
.Nm
handles the full 31-bit character set of
.Tn ISO
10646
whereas
.Xr utf2 4
is limited to a 16-bit character set,
and that
.Xr utf2 4
accepts redundant, non-"shortest form" representations of characters.
.Sh SEE ALSO
.Xr euc 4 ,
.Xr utf2 4
.Rs
.%A "F. Yergeau"
.%T "UTF-8, a transformation format of ISO 10646"
.%O "RFC 2279"
.%D "January 1998"
.Re
.Sh STANDARDS
The
.Nm
encoding is compatible with RFC 2279.
.Sh BUGS
Byte order marker (BOM) characters are neither added nor removed
from UTF-8-encoded wide character
.Xr stdio 3
streams.

204
lib/libc/locale/utf8.c Normal file
View File

@ -0,0 +1,204 @@
/*-
* Copyright (c) 2002 Tim J. Robbins
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <rune.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
rune_t _UTF8_sgetrune(const char *, size_t, char const **);
int _UTF8_sputrune(rune_t, char *, size_t, char **);
/*
 * Hook the UTF-8 conversion routines into the rune locale `rl' and make
 * it the current rune locale.
 *
 * Also raises the per-character octet limit to 6, the longest sequence
 * the UTF-8 encoder in this file can produce.
 *
 * Always returns 0.
 */
int
_UTF8_init(_RuneLocale *rl)
{

	/* Install the encoder and decoder for this locale. */
	rl->sputrune = _UTF8_sputrune;
	rl->sgetrune = _UTF8_sgetrune;

	_CurrentRuneLocale = rl;
	__mb_cur_max = 6;

	return (0);
}
/*
 * Decode a single UTF-8 multibyte character from the first `n' octets
 * of `string'.
 *
 * Returns the decoded character code, or _INVALID_RUNE if the input is
 * empty, truncated, malformed, or a redundant (non-"shortest form")
 * encoding.
 *
 * If `result' is not NULL, `*result' is set as follows:
 *   - to `string' when n is 0 or fewer octets are available than the
 *     lead octet calls for (nothing is consumed);
 *   - to `string + 1' when the first octet cannot begin a character;
 *   - to one past the offending octet when a continuation octet is
 *     malformed;
 *   - to one past the final octet of the sequence otherwise, including
 *     the case where a redundant encoding is rejected.
 */
rune_t
_UTF8_sgetrune(const char *string, size_t n, const char **result)
{
	int ch, len, mask;
	rune_t lbound, wch;

	if (n < 1) {
		if (result != NULL)
			*result = string;
		return (_INVALID_RUNE);
	}

	/*
	 * Determine the number of octets that make up this character from
	 * the first octet, and a mask that extracts the interesting bits of
	 * the first octet.
	 *
	 * We also specify a lower bound for the character code to detect
	 * redundant, non-"shortest form" encodings. For example, the
	 * sequence C0 80 is _not_ a legal representation of the null
	 * character. This enforces a 1-to-1 mapping between character
	 * codes and their multibyte representations.
	 */
	ch = (unsigned char)*string;
	if ((ch & 0x80) == 0) {
		mask = 0x7f;
		len = 1;
		lbound = 0;
	} else if ((ch & 0xe0) == 0xc0) {
		mask = 0x1f;
		len = 2;
		lbound = 0x80;
	} else if ((ch & 0xf0) == 0xe0) {
		mask = 0x0f;
		len = 3;
		lbound = 0x800;
	} else if ((ch & 0xf8) == 0xf0) {
		mask = 0x07;
		len = 4;
		lbound = 0x10000;
	} else if ((ch & 0xfc) == 0xf8) {
		mask = 0x03;
		len = 5;
		lbound = 0x200000;
	} else if ((ch & 0xfe) == 0xfc) {
		/*
		 * The lead octet of a six-octet sequence is 1111110b, so
		 * seven bits must be tested.  Testing only six would let
		 * 0xFE and 0xFF through, and those octets never occur in
		 * UTF-8.
		 */
		mask = 0x01;
		len = 6;
		lbound = 0x4000000;
	} else {
		/*
		 * Malformed input; input is not UTF-8.  This covers stray
		 * continuation octets (10xxxxxx) as well as 0xFE and 0xFF.
		 */
		if (result != NULL)
			*result = string + 1;
		return (_INVALID_RUNE);
	}

	if (n < (size_t)len) {
		/*
		 * Truncated or partial input.
		 */
		if (result != NULL)
			*result = string;
		return (_INVALID_RUNE);
	}

	/*
	 * Decode the octet sequence representing the character in chunks
	 * of 6 bits, most significant first.
	 */
	wch = (unsigned char)*string++ & mask;
	while (--len != 0) {
		ch = (unsigned char)*string;
		if ((ch & 0xc0) != 0x80) {
			/*
			 * Malformed input; bad characters in the middle
			 * of a character.
			 */
			if (result != NULL)
				*result = string + 1;
			return (_INVALID_RUNE);
		}
		wch <<= 6;
		wch |= ch & 0x3f;
		string++;
	}

	if (wch != _INVALID_RUNE && wch < lbound) {
		/*
		 * Malformed input; redundant encoding.
		 */
		wch = _INVALID_RUNE;
	}

	if (result != NULL)
		*result = string;
	return (wch);
}
/*
 * Encode the character `c' as UTF-8 into `string', which has room for
 * `n' octets.
 *
 * Returns the number of octets the encoding of `c' requires, or 0 if
 * `c' lies outside the 31-bit range that UTF-8 can represent.
 *
 * If `result' is not NULL, `*result' is set to one past the last octet
 * written on success, or to NULL when `c' is out of range or `n' is
 * too small to hold the sequence (in which case nothing is written,
 * although the required length is still returned).
 */
int
_UTF8_sputrune(rune_t c, char *string, size_t n, char **result)
{
	unsigned char prefix;
	int nbytes, pos;

	/*
	 * Pick the shortest sequence able to hold the character, along
	 * with the marker bits of the leading octet that announce the
	 * sequence length to the decoder.
	 */
	if ((c & ~0x7f) == 0) {
		nbytes = 1;
		prefix = 0;
	} else if ((c & ~0x7ff) == 0) {
		nbytes = 2;
		prefix = 0xc0;
	} else if ((c & ~0xffff) == 0) {
		nbytes = 3;
		prefix = 0xe0;
	} else if ((c & ~0x1fffff) == 0) {
		nbytes = 4;
		prefix = 0xf0;
	} else if ((c & ~0x3ffffff) == 0) {
		nbytes = 5;
		prefix = 0xf8;
	} else if ((c & ~0x7fffffff) == 0) {
		nbytes = 6;
		prefix = 0xfc;
	} else {
		/* The character code cannot be represented at all. */
		if (result != NULL)
			*result = NULL;
		return (0);
	}

	/* Not enough room: report the needed length, write nothing. */
	if (n < nbytes) {
		if (result != NULL)
			*result = NULL;
		return (nbytes);
	}

	/*
	 * Fill in the trailing octets from the end of the sequence towards
	 * the front, peeling six bits off the character each time.  What
	 * is left over belongs in the leading octet, merged with the
	 * length marker.
	 */
	pos = nbytes;
	while (--pos > 0) {
		string[pos] = (c & 0x3f) | 0x80;
		c >>= 6;
	}
	string[0] = (c & 0xff) | prefix;

	if (result != NULL)
		*result = string + nbytes;
	return (nbytes);
}