freebsd-dev/lib/libc/locale/utf8.c
Pedro F. Giffuni d915a14ef0 libc: further adoption of SPDX licensing ID tags.
Mainly focus on files that use BSD 2-Clause license, however the tool I
was using mis-identified many licenses so this was mostly a manual - error
prone - task.

The Software Package Data Exchange (SPDX) group provides a specification
to make it easier for automated tools to detect and summarize well-known
open source licenses. We are gradually adopting the specification, noting
that the tags are considered only advisory and do not, in any way,
supersede or replace the license texts.
2017-11-25 17:12:48 +00:00

427 lines
10 KiB
C

/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright 2013 Garrett D'Amore <garrett@damore.org>
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2002-2004 Tim J. Robbins
* All rights reserved.
*
* Copyright (c) 2011 The FreeBSD Foundation
* All rights reserved.
* Portions of this software were developed by David Chisnall
* under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/param.h>
__FBSDID("$FreeBSD$");
#include <errno.h>
#include <limits.h>
#include <runetype.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include "mblocal.h"
/*
 * NOTE(review): no direct use of this global is visible in this file;
 * _UTF8_init() sets the per-locale member of the same name instead.
 */
extern int __mb_sb_limit;

/*
 * Forward declarations of the UTF-8 conversion routines.  _UTF8_init()
 * stores pointers to them in the struct xlocale_ctype it is given.
 */
static size_t	_UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict,
		    size_t, mbstate_t * __restrict);
static int	_UTF8_mbsinit(const mbstate_t *);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict,
		    const char ** __restrict, size_t, size_t,
		    mbstate_t * __restrict);
static size_t	_UTF8_wcrtomb(char * __restrict, wchar_t,
		    mbstate_t * __restrict);
static size_t	_UTF8_wcsnrtombs(char * __restrict, const wchar_t ** __restrict,
		    size_t, size_t, mbstate_t * __restrict);
/*
 * Conversion state carried between calls.  The routines in this file
 * access it by casting the caller's mbstate_t pointer to this type.
 * A "want" of zero denotes the initial state.
 */
typedef struct {
	wchar_t	ch;	/* bits accumulated so far for a partial character */
	int	want;	/* bytes still needed to finish it; 0 if none pending */
	wchar_t	lbound;	/* smallest code point valid for the sequence length */
} _UTF8State;
/*
 * Prepare a ctype locale object for the UTF-8 encoding: attach the rune
 * table rl and install the UTF-8 conversion routines.  Always returns 0.
 */
int
_UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl)
{

	l->runes = rl;

	/* Longest multibyte character, in bytes. */
	l->__mb_cur_max = 4;

	/*
	 * UCS-4 is used as the internal representation, so slots
	 * 0x0080-0x00FF are occupied and must be excluded from the
	 * single-byte ctype by setting the limit.
	 */
	l->__mb_sb_limit = 128;

	/* Multibyte -> wide character direction. */
	l->__mbsinit = _UTF8_mbsinit;
	l->__mbrtowc = _UTF8_mbrtowc;
	l->__mbsnrtowcs = _UTF8_mbsnrtowcs;

	/* Wide character -> multibyte direction. */
	l->__wcrtomb = _UTF8_wcrtomb;
	l->__wcsnrtombs = _UTF8_wcsnrtombs;

	return (0);
}
static int
_UTF8_mbsinit(const mbstate_t *ps)
{
return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
}
/*
 * Convert the next UTF-8 sequence in s (at most n bytes) to a wide
 * character, resuming from the partial character saved in *ps if one is
 * pending.
 *
 * pwc	where to store the decoded character; may be NULL to discard it.
 * s	input bytes; NULL is handled as a call on the string "" with n = 1
 *	and the result discarded.
 * n	number of bytes available at s.
 * ps	conversion state; must not be NULL.
 *
 * Returns the number of bytes consumed by this call, 0 if the decoded
 * character is the null wide character, (size_t)-2 if the available
 * bytes form only the beginning of a sequence (the partial result is
 * saved in *ps), or (size_t)-1 with errno set to EILSEQ for malformed
 * input or to EINVAL for an invalid conversion state.
 */
static size_t
_UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
    mbstate_t * __restrict ps)
{
	_UTF8State *us;
	int ch, i, mask, want;
	wchar_t lbound, wch;

	us = (_UTF8State *)ps;

	/*
	 * The longest sequence accepted below is four bytes, so a state
	 * saved by this function never waits for more than that.  Any
	 * other value means the mbstate_t is corrupt or was not produced
	 * by this decoder.
	 */
	if (us->want < 0 || us->want > 4) {
		errno = EINVAL;
		return ((size_t)-1);
	}

	if (s == NULL) {
		s = "";
		n = 1;
		pwc = NULL;
	}

	if (n == 0)
		/* Incomplete multibyte sequence */
		return ((size_t)-2);

	if (us->want == 0) {
		ch = (unsigned char)*s;
		if ((ch & 0x80) == 0) {
			/* Fast path for plain ASCII characters. */
			if (pwc != NULL)
				*pwc = ch;
			return (ch != '\0' ? 1 : 0);
		}

		/*
		 * The character is at least two bytes long.  Determine the
		 * number of octets that make it up from the first octet,
		 * and a mask that extracts the interesting bits of the
		 * first octet.
		 *
		 * We also specify a lower bound for the character code to
		 * detect redundant, non-"shortest form" encodings.  For
		 * example, the sequence C0 80 is _not_ a legal
		 * representation of the null character.  This enforces a
		 * 1-to-1 mapping between character codes and their
		 * multibyte representations.
		 */
		if ((ch & 0xe0) == 0xc0) {
			mask = 0x1f;
			want = 2;
			lbound = 0x80;
		} else if ((ch & 0xf0) == 0xe0) {
			mask = 0x0f;
			want = 3;
			lbound = 0x800;
		} else if ((ch & 0xf8) == 0xf0) {
			mask = 0x07;
			want = 4;
			lbound = 0x10000;
		} else {
			/*
			 * Malformed input; input is not UTF-8.
			 */
			errno = EILSEQ;
			return ((size_t)-1);
		}

		/* The lead octet supplies the most significant bits. */
		wch = ch & mask;
		s++;
		i = 1;
	} else {
		/* Pick up where the previous call stopped. */
		want = us->want;
		lbound = us->lbound;
		wch = us->ch;
		i = 0;
	}

	/*
	 * Decode the remaining octets of the character in chunks of
	 * 6 bits, most significant first, without reading past the n
	 * bytes that are available.
	 */
	for (; i < MIN(want, n); i++) {
		if ((*s & 0xc0) != 0x80) {
			/*
			 * Malformed input; bad characters in the middle
			 * of a character.
			 */
			errno = EILSEQ;
			return ((size_t)-1);
		}
		wch <<= 6;
		wch |= *s++ & 0x3f;
	}
	if (i < want) {
		/* Incomplete multibyte sequence; remember the progress. */
		us->want = want - i;
		us->lbound = lbound;
		us->ch = wch;
		return ((size_t)-2);
	}
	if (wch < lbound) {
		/*
		 * Malformed input; redundant encoding.
		 */
		errno = EILSEQ;
		return ((size_t)-1);
	}
	if ((wch >= 0xd800 && wch <= 0xdfff) || wch > 0x10ffff) {
		/*
		 * Malformed input; invalid code points.
		 */
		errno = EILSEQ;
		return ((size_t)-1);
	}
	if (pwc != NULL)
		*pwc = wch;
	us->want = 0;
	return (wch == L'\0' ? 0 : want);
}
/*
 * Convert the UTF-8 string at *src, of which at most nms bytes are
 * examined, into wide characters.
 *
 * With dst == NULL nothing is stored, len is ignored and *src is left
 * untouched; conversion runs until a NUL or the end of the input.
 * Otherwise at most len wide characters are stored in dst and *src is
 * updated: to NULL when the terminating NUL was converted, to the end of
 * the input when it stops inside an incomplete sequence, to the start of
 * the rejected sequence when _UTF8_mbrtowc() fails, and in all other
 * cases to the first byte that was not converted.
 *
 * Returns the number of wide characters converted, not counting a
 * terminating null, or (size_t)-1 on error.  errno is set to EILSEQ here
 * when an ASCII byte follows a partial character held in *ps, and by
 * _UTF8_mbrtowc() otherwise.
 */
static size_t
_UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
    size_t nms, size_t len, mbstate_t * __restrict ps)
{
	const _UTF8State *state;
	const char *in;
	size_t count, used;
	wchar_t scratch;

	state = (const _UTF8State *)ps;
	in = *src;
	count = 0;

	/*
	 * The ASCII fast path in the loops below is only safe when the
	 * byte starts a new character.  That can fail only for the very
	 * first byte, when a partial character is pending, so test it
	 * once here instead of on every iteration.
	 */
	if (nms > 0 && (dst == NULL || len > 0) && state->want > 0 &&
	    (signed char)*in > 0) {
		errno = EILSEQ;
		return ((size_t)-1);
	}

	if (dst == NULL) {
		/* Counting only: run until NUL or the input is used up. */
		for (;;) {
			if (nms > 0 && (signed char)*in > 0) {
				/* Plain ASCII, excluding NUL. */
				used = 1;
			} else {
				used = _UTF8_mbrtowc(&scratch, in, nms, ps);
				if (used == (size_t)-1)
					/* mbrtowc() has set errno. */
					return ((size_t)-1);
				if (used == 0 || used == (size_t)-2)
					return (count);
			}
			in += used;
			nms -= used;
			count++;
		}
		/*NOTREACHED*/
	}

	/* Storing: one output slot is filled per iteration. */
	while (count < len) {
		if (nms > 0 && (signed char)*in > 0) {
			/* Plain ASCII, excluding NUL. */
			dst[count] = (wchar_t)*in;
			used = 1;
		} else {
			used = _UTF8_mbrtowc(&dst[count], in, nms, ps);
			if (used == (size_t)-1) {
				*src = in;
				return ((size_t)-1);
			}
			if (used == (size_t)-2) {
				*src = in + nms;
				return (count);
			}
			if (used == 0) {
				*src = NULL;
				return (count);
			}
		}
		in += used;
		nms -= used;
		count++;
	}

	*src = in;
	return (count);
}
/*
 * Encode the wide character wc as UTF-8 into s, always using the
 * shortest possible sequence.
 *
 * s	output buffer with room for the encoded character (up to four
 *	bytes); if NULL, nothing is written and 1 is returned.
 * wc	character to encode.
 * ps	conversion state; must not hold a partial character.
 *
 * Returns the number of bytes written, or (size_t)-1 with errno set to
 * EINVAL if *ps holds a partial character, or to EILSEQ if wc is a
 * surrogate (0xd800-0xdfff) or lies outside 0-0x10ffff.
 */
static size_t
_UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
{
	const _UTF8State *state;
	unsigned char prefix;
	int nbytes, pos, shift;

	state = (const _UTF8State *)ps;
	if (state->want != 0) {
		errno = EINVAL;
		return ((size_t)-1);
	}

	if (s == NULL)
		/* Reset to initial shift state (no-op) */
		return (1);

	if (wc >= 0 && wc <= 0x7f) {
		/* Plain ASCII is stored as a single byte. */
		s[0] = (char)wc;
		return (1);
	}

	/* Negative values, values past U+10FFFF and surrogates are invalid. */
	if (wc < 0 || wc > 0x10ffff || (wc >= 0xd800 && wc <= 0xdfff)) {
		errno = EILSEQ;
		return ((size_t)-1);
	}

	/*
	 * Choose the sequence length and the marker bits of the first
	 * octet, which encode that length.
	 */
	if (wc <= 0x7ff) {
		prefix = 0xc0;
		nbytes = 2;
	} else if (wc <= 0xffff) {
		prefix = 0xe0;
		nbytes = 3;
	} else {
		prefix = 0xf0;
		nbytes = 4;
	}

	/*
	 * Emit the octets front to back.  The first carries the length
	 * marker plus the most significant bits; each following octet
	 * carries the next 6 bits below a 10xxxxxx marker.
	 */
	shift = 6 * (nbytes - 1);
	s[0] = ((wc >> shift) & 0xff) | prefix;
	for (pos = 1; pos < nbytes; pos++) {
		shift -= 6;
		s[pos] = ((wc >> shift) & 0x3f) | 0x80;
	}

	return (nbytes);
}
/*
 * Convert up to nwc wide characters from *src to UTF-8.
 *
 * dst	output buffer of len bytes, or NULL to only count the bytes the
 *	conversion would produce (len is then ignored and *src is not
 *	updated).
 * src	in/out pointer to the input.  With a non-NULL dst it is set to
 *	NULL once a terminating null wide character has been converted,
 *	to the character that could not be encoded on an EILSEQ failure,
 *	and otherwise to the first character that was not converted.
 * nwc	maximum number of wide characters to read from *src.
 * len	size of dst in bytes.
 * ps	conversion state; must not hold a partial character.
 *
 * Returns the number of bytes produced, not counting a terminating NUL,
 * or (size_t)-1 with errno set to EINVAL (partial character in *ps) or
 * by _UTF8_wcrtomb() (character that cannot be encoded).
 */
static size_t
_UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
    size_t nwc, size_t len, mbstate_t * __restrict ps)
{
	/*
	 * Longest sequence _UTF8_wcrtomb() can emit.  The room check below
	 * must use this encoder's own bound rather than MB_CUR_MAX:
	 * MB_CUR_MAX describes the caller's current locale, which need not
	 * be the locale this converter was installed for, and when it is
	 * smaller than a UTF-8 sequence the in-place path could write past
	 * the end of dst.
	 */
	const size_t maxlen = 4;
	_UTF8State *us;
	char buf[MB_LEN_MAX];
	const wchar_t *s;
	size_t nbytes;
	size_t nb;

	us = (_UTF8State *)ps;

	if (us->want != 0) {
		errno = EINVAL;
		return ((size_t)-1);
	}

	s = *src;
	nbytes = 0;

	if (dst == NULL) {
		/* Counting only: encode into a scratch buffer. */
		while (nwc-- > 0) {
			if (0 <= *s && *s < 0x80)
				/* Fast path for plain ASCII characters. */
				nb = 1;
			else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
			    (size_t)-1)
				/* Invalid character - wcrtomb() sets errno. */
				return ((size_t)-1);
			if (*s == L'\0')
				/* The terminating NUL is not counted. */
				return (nbytes + nb - 1);
			s++;
			nbytes += nb;
		}
		return (nbytes);
	}

	while (len > 0 && nwc-- > 0) {
		if (0 <= *s && *s < 0x80) {
			/* Fast path for plain ASCII characters. */
			nb = 1;
			*dst = *s;
		} else if (len >= maxlen) {
			/* Room for any sequence; translate in-place. */
			if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
				*src = s;
				return ((size_t)-1);
			}
		} else {
			/*
			 * May not be enough space; use temp. buffer.
			 */
			if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
				*src = s;
				return ((size_t)-1);
			}
			if (nb > len)
				/* MB sequence for character won't fit. */
				break;
			memcpy(dst, buf, nb);
		}
		if (*s == L'\0') {
			/* Terminator stored; it is not counted. */
			*src = NULL;
			return (nbytes + nb - 1);
		}
		s++;
		dst += nb;
		len -= nb;
		nbytes += nb;
	}
	*src = s;
	return (nbytes);
}