Add a UTF-8 encoding method, which will eventually replace the antique
"UTF2" method. Although UTF-8 and the old UTF2 encoding are compatible for 16-bit characters, the new UTF-8 implementation is much stricter about rejecting malformed input and also handles the full 31-bit range of characters.
This commit is contained in:
parent
9b30d71989
commit
972baa3747
@ -11,7 +11,8 @@ SRCS+= big5.c btowc.c collate.c collcmp.c euc.c fix_grouping.c frune.c \
|
||||
mbrtowc.c mbrune.c mbsinit.c mbsrtowcs.c mbtowc.c mbstowcs.c \
|
||||
mskanji.c nl_langinfo.c nomacros.c none.c rune.c \
|
||||
runetype.c setinvalidrune.c setlocale.c setrunelocale.c table.c \
|
||||
tolower.c toupper.c utf2.c wcrtomb.c wcsrtombs.c wcsftime.c wcstod.c \
|
||||
tolower.c toupper.c utf2.c utf8.c wcrtomb.c wcsrtombs.c wcsftime.c \
|
||||
wcstod.c \
|
||||
wcstoimax.c wcstol.c wcstoll.c \
|
||||
wcstombs.c \
|
||||
wcstoul.c wcstoull.c wcstoumax.c wctob.c wctomb.c wctrans.c wctype.c \
|
||||
@ -31,6 +32,7 @@ MAN+= btowc.3 \
|
||||
wcsrtombs.3 wcstod.3 wcstol.3 \
|
||||
wctrans.3 wctype.3 wcwidth.3
|
||||
MAN+= euc.4 utf2.4
|
||||
MAN+= utf8.5
|
||||
|
||||
MLINKS+=btowc.3 wctob.3
|
||||
MLINKS+=isdigit.3 isnumber.3
|
||||
|
@ -161,7 +161,8 @@ does not appear in the string.
|
||||
.Xr rune 3 ,
|
||||
.Xr setlocale 3 ,
|
||||
.Xr euc 4 ,
|
||||
.Xr utf2 4
|
||||
.Xr utf2 4 ,
|
||||
.Xr utf8 5
|
||||
.Sh HISTORY
|
||||
The
|
||||
.Fn mbrune ,
|
||||
|
@ -232,7 +232,8 @@ both functions return \-1.
|
||||
.Xr wcrtomb 3 ,
|
||||
.Xr wcsrtombs 3 ,
|
||||
.Xr euc 4 ,
|
||||
.Xr utf2 4
|
||||
.Xr utf2 4 ,
|
||||
.Xr utf8 5
|
||||
.Sh STANDARDS
|
||||
The
|
||||
.Fn mblen ,
|
||||
|
@ -265,7 +265,8 @@ binary LC_CTYPE file for the locale
|
||||
.Xr mbrune 3 ,
|
||||
.Xr setlocale 3 ,
|
||||
.Xr euc 4 ,
|
||||
.Xr utf2 4
|
||||
.Xr utf2 4 ,
|
||||
.Xr utf8 5
|
||||
.Sh HISTORY
|
||||
These functions first appeared in
|
||||
.Bx 4.4 .
|
||||
|
@ -334,7 +334,8 @@ and the category
|
||||
.Xr strcoll 3 ,
|
||||
.Xr strxfrm 3 ,
|
||||
.Xr euc 4 ,
|
||||
.Xr utf2 4
|
||||
.Xr utf2 4 ,
|
||||
.Xr utf8 5
|
||||
.Sh STANDARDS
|
||||
The
|
||||
.Fn setlocale
|
||||
|
@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$");
|
||||
|
||||
extern int _none_init(_RuneLocale *);
|
||||
extern int _UTF2_init(_RuneLocale *);
|
||||
extern int _UTF8_init(_RuneLocale *);
|
||||
extern int _EUC_init(_RuneLocale *);
|
||||
extern int _BIG5_init(_RuneLocale *);
|
||||
extern int _MSKanji_init(_RuneLocale *);
|
||||
@ -130,6 +131,8 @@ setrunelocale(char *encoding)
|
||||
ret = _none_init(rl);
|
||||
else if (strcmp(rl->encoding, "UTF2") == 0)
|
||||
ret = _UTF2_init(rl);
|
||||
else if (strcmp(rl->encoding, "UTF-8") == 0)
|
||||
ret = _UTF8_init(rl);
|
||||
else if (strcmp(rl->encoding, "EUC") == 0)
|
||||
ret = _EUC_init(rl);
|
||||
else if (strcmp(rl->encoding, "BIG5") == 0)
|
||||
|
@ -35,7 +35,7 @@
|
||||
.\" @(#)utf2.4 8.1 (Berkeley) 6/4/93
|
||||
.\" $FreeBSD$
|
||||
.\"
|
||||
.Dd June 4, 1993
|
||||
.Dd October 11, 2002
|
||||
.Dt UTF2 4
|
||||
.Os
|
||||
.Sh NAME
|
||||
@ -45,6 +45,11 @@
|
||||
.Nm ENCODING
|
||||
.Qq UTF2
|
||||
.Sh DESCRIPTION
|
||||
.Bf Em
|
||||
The UTF2 encoding has been deprecated in favour of UTF-8.
|
||||
.Ef
|
||||
New applications should not use UTF2.
|
||||
.Pp
|
||||
The
|
||||
.Nm UTF2
|
||||
encoding is based on a proposed X-Open multibyte
|
||||
@ -85,4 +90,5 @@ which provides for the entire proposed ISO-10646 31 bit standard are currently
|
||||
not implemented.
|
||||
.Sh "SEE ALSO"
|
||||
.Xr mklocale 1 ,
|
||||
.Xr setlocale 3
|
||||
.Xr setlocale 3 ,
|
||||
.Xr utf8 5
|
||||
|
@ -35,7 +35,7 @@
|
||||
.\" @(#)utf2.4 8.1 (Berkeley) 6/4/93
|
||||
.\" $FreeBSD$
|
||||
.\"
|
||||
.Dd June 4, 1993
|
||||
.Dd October 11, 2002
|
||||
.Dt UTF2 4
|
||||
.Os
|
||||
.Sh NAME
|
||||
@ -45,6 +45,11 @@
|
||||
.Nm ENCODING
|
||||
.Qq UTF2
|
||||
.Sh DESCRIPTION
|
||||
.Bf Em
|
||||
The UTF2 encoding has been deprecated in favour of UTF-8.
|
||||
.Ef
|
||||
New applications should not use UTF2.
|
||||
.Pp
|
||||
The
|
||||
.Nm UTF2
|
||||
encoding is based on a proposed X-Open multibyte
|
||||
@ -85,4 +90,5 @@ which provides for the entire proposed ISO-10646 31 bit standard are currently
|
||||
not implemented.
|
||||
.Sh "SEE ALSO"
|
||||
.Xr mklocale 1 ,
|
||||
.Xr setlocale 3
|
||||
.Xr setlocale 3 ,
|
||||
.Xr utf8 5
|
||||
|
115
lib/libc/locale/utf8.5
Normal file
115
lib/libc/locale/utf8.5
Normal file
@ -0,0 +1,115 @@
|
||||
.\" Copyright (c) 1993
|
||||
.\" The Regents of the University of California. All rights reserved.
|
||||
.\"
|
||||
.\" This code is derived from software contributed to Berkeley by
|
||||
.\" Paul Borman at Krystal Technologies.
|
||||
.\"
|
||||
.\" Redistribution and use in source and binary forms, with or without
|
||||
.\" modification, are permitted provided that the following conditions
|
||||
.\" are met:
|
||||
.\" 1. Redistributions of source code must retain the above copyright
|
||||
.\" notice, this list of conditions and the following disclaimer.
|
||||
.\" 2. Redistributions in binary form must reproduce the above copyright
|
||||
.\" notice, this list of conditions and the following disclaimer in the
|
||||
.\" documentation and/or other materials provided with the distribution.
|
||||
.\" 3. All advertising materials mentioning features or use of this software
|
||||
.\" must display the following acknowledgement:
|
||||
.\" This product includes software developed by the University of
|
||||
.\" California, Berkeley and its contributors.
|
||||
.\" 4. Neither the name of the University nor the names of its contributors
|
||||
.\" may be used to endorse or promote products derived from this software
|
||||
.\" without specific prior written permission.
|
||||
.\"
|
||||
.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
.\" SUCH DAMAGE.
|
||||
.\"
|
||||
.\" @(#)utf2.4 8.1 (Berkeley) 6/4/93
|
||||
.\" $FreeBSD$
|
||||
.\"
|
||||
.Dd October 10, 2002
|
||||
.Dt UTF8 5
|
||||
.Os
|
||||
.Sh NAME
|
||||
.Nm utf8
|
||||
.Nd "UTF-8, a transformation format of ISO 10646"
|
||||
.Sh SYNOPSIS
|
||||
.Nm ENCODING
|
||||
.Qq UTF-8
|
||||
.Sh DESCRIPTION
|
||||
The
|
||||
.Nm UTF-8
|
||||
encoding represents UCS-4 characters as a sequence of octets, using
|
||||
between 1 and 6 for each character.
|
||||
It is backwards compatible with
|
||||
.Tn ASCII ,
|
||||
so 0x00-0x7f refer to the
|
||||
.Tn ASCII
|
||||
character set.
|
||||
The multibyte encoding of non-
|
||||
.Tn ASCII
|
||||
characters
|
||||
consists entirely of bytes whose high order bit is set.
|
||||
The actual
|
||||
encoding is represented by the following table:
|
||||
.Bd -literal
|
||||
[0x00000000 - 0x0000007f] [00000000.0bbbbbbb] -> 0bbbbbbb
|
||||
[0x00000080 - 0x000007ff] [00000bbb.bbbbbbbb] -> 110bbbbb, 10bbbbbb
|
||||
[0x00000800 - 0x0000ffff] [bbbbbbbb.bbbbbbbb] ->
|
||||
1110bbbb, 10bbbbbb, 10bbbbbb
|
||||
[0x00010000 - 0x001fffff] [00000000.000bbbbb.bbbbbbbb.bbbbbbbb] ->
|
||||
11110bbb, 10bbbbbb, 10bbbbbb, 10bbbbbb
|
||||
[0x00200000 - 0x03ffffff] [000000bb.bbbbbbbb.bbbbbbbb.bbbbbbbb] ->
|
||||
111110bb, 10bbbbbb, 10bbbbbb, 10bbbbbb, 10bbbbbb
|
||||
[0x04000000 - 0x7fffffff] [0bbbbbbb.bbbbbbbb.bbbbbbbb.bbbbbbbb] ->
|
||||
1111110b, 10bbbbbb, 10bbbbbb, 10bbbbbb, 10bbbbbb, 10bbbbbb
|
||||
.Ed
|
||||
.Pp
|
||||
If more than a single representation of a value exists (for example,
|
||||
0x00; 0xC0 0x80; 0xE0 0x80 0x80) the shortest representation is always
|
||||
used.
|
||||
Longer ones are detected as an error as they pose a potential
|
||||
security risk, and destroy the 1:1 character:octet sequence mapping.
|
||||
.Sh COMPATIBILITY
|
||||
The
|
||||
.Nm
|
||||
encoding supersedes the
|
||||
.Xr utf2 4
|
||||
encoding.
|
||||
The only differences between the two are that
|
||||
.Nm
|
||||
handles the full 31-bit character set of
|
||||
.Tn ISO
|
||||
10646
|
||||
whereas
|
||||
.Xr utf2 4
|
||||
is limited to a 16-bit character set,
|
||||
and that
|
||||
.Xr utf2 4
|
||||
accepts redundant, non-"shortest form" representations of characters.
|
||||
.Sh SEE ALSO
|
||||
.Xr euc 4 ,
|
||||
.Xr utf2 4
|
||||
.Rs
|
||||
.%A "F. Yergeau"
|
||||
.%T "UTF-8, a transformation format of ISO 10646"
|
||||
.%O "RFC 2279"
|
||||
.%D "January 1998"
|
||||
.Re
|
||||
.Sh STANDARDS
|
||||
The
|
||||
.Nm
|
||||
encoding is compatible with RFC 2279.
|
||||
.Sh BUGS
|
||||
Byte order mark (BOM) characters are neither added nor removed
|
||||
from UTF-8-encoded wide character
|
||||
.Xr stdio 3
|
||||
streams.
|
204
lib/libc/locale/utf8.c
Normal file
204
lib/libc/locale/utf8.c
Normal file
@ -0,0 +1,204 @@
|
||||
/*-
|
||||
* Copyright (c) 2002 Tim J. Robbins
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#include <sys/cdefs.h>
|
||||
__FBSDID("$FreeBSD$");
|
||||
|
||||
#include <rune.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
/*
 * UTF-8 decoder and encoder; _UTF8_init() stores these in the rune
 * locale's sgetrune and sputrune members.
 */
rune_t	_UTF8_sgetrune(const char *string, size_t n, const char **result);
int	_UTF8_sputrune(rune_t c, char *string, size_t n, char **result);
|
||||
|
||||
int
|
||||
_UTF8_init(_RuneLocale *rl)
|
||||
{
|
||||
|
||||
rl->sgetrune = _UTF8_sgetrune;
|
||||
rl->sputrune = _UTF8_sputrune;
|
||||
_CurrentRuneLocale = rl;
|
||||
__mb_cur_max = 6;
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
rune_t
|
||||
_UTF8_sgetrune(const char *string, size_t n, const char **result)
|
||||
{
|
||||
int ch, len, mask, siglen;
|
||||
rune_t lbound, wch;
|
||||
|
||||
if (n < 1) {
|
||||
if (result != NULL)
|
||||
*result = string;
|
||||
return (_INVALID_RUNE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Determine the number of octets that make up this character from
|
||||
* the first octet, and a mask that extracts the interesting bits of
|
||||
* the first octet.
|
||||
*
|
||||
* We also specify a lower bound for the character code to detect
|
||||
* redundant, non-"shortest form" encodings. For example, the
|
||||
* sequence C0 80 is _not_ a legal representation of the null
|
||||
* character. This enforces a 1-to-1 mapping between character
|
||||
* codes and their multibyte representations.
|
||||
*/
|
||||
ch = (unsigned char)*string;
|
||||
if ((ch & 0x80) == 0) {
|
||||
mask = 0x7f;
|
||||
len = 1;
|
||||
lbound = 0;
|
||||
} else if ((ch & 0xe0) == 0xc0) {
|
||||
mask = 0x1f;
|
||||
len = 2;
|
||||
lbound = 0x80;
|
||||
} else if ((ch & 0xf0) == 0xe0) {
|
||||
mask = 0x0f;
|
||||
len = 3;
|
||||
lbound = 0x800;
|
||||
} else if ((ch & 0xf8) == 0xf0) {
|
||||
mask = 0x07;
|
||||
len = 4;
|
||||
lbound = 0x10000;
|
||||
} else if ((ch & 0xfc) == 0xf8) {
|
||||
mask = 0x03;
|
||||
len = 5;
|
||||
lbound = 0x200000;
|
||||
} else if ((ch & 0xfc) == 0xfc) {
|
||||
mask = 0x01;
|
||||
len = 6;
|
||||
lbound = 0x4000000;
|
||||
} else {
|
||||
/*
|
||||
* Malformed input; input is not UTF-8.
|
||||
*/
|
||||
if (result != NULL)
|
||||
*result = string + 1;
|
||||
return (_INVALID_RUNE);
|
||||
}
|
||||
|
||||
if (n < len) {
|
||||
/*
|
||||
* Truncated or partial input.
|
||||
*/
|
||||
if (result != NULL)
|
||||
*result = string;
|
||||
return (_INVALID_RUNE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Decode the octet sequence representing the character in chunks
|
||||
* of 6 bits, most significant first.
|
||||
*/
|
||||
wch = (unsigned char)*string++ & mask;
|
||||
while (--len != 0) {
|
||||
if ((*string & 0xc0) != 0x80) {
|
||||
/*
|
||||
* Malformed input; bad characters in the middle
|
||||
* of a character.
|
||||
*/
|
||||
wch = _INVALID_RUNE;
|
||||
if (result != NULL)
|
||||
*result = string + 1;
|
||||
return (_INVALID_RUNE);
|
||||
}
|
||||
wch <<= 6;
|
||||
wch |= *string++ & 0x3f;
|
||||
}
|
||||
if (wch != _INVALID_RUNE && wch < lbound)
|
||||
/*
|
||||
* Malformed input; redundant encoding.
|
||||
*/
|
||||
wch = _INVALID_RUNE;
|
||||
if (result != NULL)
|
||||
*result = string;
|
||||
return (wch);
|
||||
}
|
||||
|
||||
int
|
||||
_UTF8_sputrune(rune_t c, char *string, size_t n, char **result)
|
||||
{
|
||||
unsigned char lead;
|
||||
int i, len;
|
||||
|
||||
/*
|
||||
* Determine the number of octets needed to represent this character.
|
||||
* We always output the shortest sequence possible. Also specify the
|
||||
* first few bits of the first octet, which contains the information
|
||||
* about the sequence length.
|
||||
*/
|
||||
if ((c & ~0x7f) == 0) {
|
||||
lead = 0;
|
||||
len = 1;
|
||||
} else if ((c & ~0x7ff) == 0) {
|
||||
lead = 0xc0;
|
||||
len = 2;
|
||||
} else if ((c & ~0xffff) == 0) {
|
||||
lead = 0xe0;
|
||||
len = 3;
|
||||
} else if ((c & ~0x1fffff) == 0) {
|
||||
lead = 0xf0;
|
||||
len = 4;
|
||||
} else if ((c & ~0x3ffffff) == 0) {
|
||||
lead = 0xf8;
|
||||
len = 5;
|
||||
} else if ((c & ~0x7fffffff) == 0) {
|
||||
lead = 0xfc;
|
||||
len = 6;
|
||||
} else {
|
||||
/*
|
||||
* Wide character code is out of range.
|
||||
*/
|
||||
if (result != NULL)
|
||||
*result = NULL;
|
||||
return (0);
|
||||
}
|
||||
|
||||
if (n < len) {
|
||||
if (result != NULL)
|
||||
*result = NULL;
|
||||
} else {
|
||||
/*
|
||||
* Output the octets representing the character in chunks
|
||||
* of 6 bits, least significant last. The first octet is
|
||||
* a special case because it contains the sequence length
|
||||
* information.
|
||||
*/
|
||||
for (i = len - 1; i > 0; i--) {
|
||||
string[i] = (c & 0x3f) | 0x80;
|
||||
c >>= 6;
|
||||
}
|
||||
*string = (c & 0xff) | lead;
|
||||
if (result != NULL)
|
||||
*result = string + len;
|
||||
}
|
||||
|
||||
return (len);
|
||||
}
|
Loading…
Reference in New Issue
Block a user