diff --git a/lib/libc/locale/Makefile.inc b/lib/libc/locale/Makefile.inc index ffef22862e8f..c2f2f4e61727 100644 --- a/lib/libc/locale/Makefile.inc +++ b/lib/libc/locale/Makefile.inc @@ -4,11 +4,11 @@ # locale sources .PATH: ${.CURDIR}/${LIBC_ARCH}/locale ${.CURDIR}/locale -SRCS+= ascii.c big5.c btowc.c c16rtomb.c c32rtomb.c collate.c collcmp.c euc.c \ - fix_grouping.c gb18030.c gb2312.c gbk.c ctype.c isctype.c iswctype.c \ +SRCS+= ascii.c big5.c btowc.c collate.c collcmp.c euc.c fix_grouping.c \ + gb18030.c gb2312.c gbk.c ctype.c isctype.c iswctype.c \ ldpart.c lmessages.c lmonetary.c lnumeric.c localeconv.c mblen.c \ mbrlen.c \ - mbrtoc16.c mbrtoc32.c mbrtowc.c mbsinit.c mbsnrtowcs.c \ + mbrtowc.c mbsinit.c mbsnrtowcs.c \ mbsrtowcs.c mbtowc.c mbstowcs.c \ mskanji.c nextwctype.c nl_langinfo.c nomacros.c none.c rpmatch.c \ rune.c \ @@ -23,6 +23,12 @@ SRCS+= ascii.c big5.c btowc.c c16rtomb.c c32rtomb.c collate.c collcmp.c euc.c \ wcwidth.c\ xlocale.c +.if ${MK_ICONV} != "no" +SRCS+= c16rtomb_iconv.c c32rtomb_iconv.c mbrtoc16_iconv.c mbrtoc32_iconv.c +.else +SRCS+= c16rtomb.c c32rtomb.c mbrtoc16.c mbrtoc32.c +.endif + SYM_MAPS+=${.CURDIR}/locale/Symbol.map MAN+= btowc.3 \ diff --git a/lib/libc/locale/c16rtomb_iconv.c b/lib/libc/locale/c16rtomb_iconv.c new file mode 100644 index 000000000000..86bd9dab2a52 --- /dev/null +++ b/lib/libc/locale/c16rtomb_iconv.c @@ -0,0 +1,8 @@ +/* $FreeBSD$ */ +#define charXX_t char16_t +#define cXXrtomb c16rtomb +#define cXXrtomb_l c16rtomb_l +#define SRCBUF_LEN 2 +#define UTF_XX_INTERNAL "UTF-16-INTERNAL" + +#include "cXXrtomb_iconv.h" diff --git a/lib/libc/locale/c32rtomb_iconv.c b/lib/libc/locale/c32rtomb_iconv.c new file mode 100644 index 000000000000..dabbfd7f7ab4 --- /dev/null +++ b/lib/libc/locale/c32rtomb_iconv.c @@ -0,0 +1,8 @@ +/* $FreeBSD$ */ +#define charXX_t char32_t +#define cXXrtomb c32rtomb +#define cXXrtomb_l c32rtomb_l +#define SRCBUF_LEN 1 +#define UTF_XX_INTERNAL "UTF-32-INTERNAL" + +#include "cXXrtomb_iconv.h" diff 
--git a/lib/libc/locale/cXXrtomb_iconv.h b/lib/libc/locale/cXXrtomb_iconv.h new file mode 100644 index 000000000000..d6e7ce0ae3ac --- /dev/null +++ b/lib/libc/locale/cXXrtomb_iconv.h @@ -0,0 +1,115 @@ +/*- + * Copyright (c) 2013 Ed Schouten + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <langinfo.h>
+#include <uchar.h>
+
+#include "../iconv/citrus_hash.h"
+#include "../iconv/citrus_module.h"
+#include "../iconv/citrus_iconv.h"
+#include "xlocale_private.h"
+/* Conversion state overlaid on the caller's mbstate_t. */
+typedef struct {
+	bool			initialized;
+	struct _citrus_iconv	iconv;
+	union {
+		charXX_t	widechar[SRCBUF_LEN];
+		char		bytes[sizeof(charXX_t) * SRCBUF_LEN];
+	} srcbuf;
+	size_t			srcbuf_len;
+} _ConversionState;
+_Static_assert(sizeof(_ConversionState) <= sizeof(mbstate_t),
+    "Size of _ConversionState must not exceed mbstate_t's size.");
+/* Encode c into s using the codeset of locale; state is kept in *ps. */
+size_t
+cXXrtomb_l(char * __restrict s, charXX_t c, mbstate_t * __restrict ps,
+    locale_t locale)
+{
+	_ConversionState *cs;
+	struct _citrus_iconv *handle;
+	char *src, *dst;
+	size_t srcleft, dstleft, invlen;
+	int err;
+
+	FIX_LOCALE(locale);
+	if (ps == NULL)
+		ps = &locale->cXXrtomb;	/* fall back to the per-locale state */
+	cs = (_ConversionState *)ps;
+	handle = &cs->iconv;
+
+	/* (Re)open the converter in *ps on first use or when s is NULL. */
+	if (s == NULL || !cs->initialized) {
+		if (_citrus_iconv_open(&handle, UTF_XX_INTERNAL,
+		    nl_langinfo_l(CODESET, locale)) != 0) {
+			cs->initialized = false;
+			errno = EINVAL;
+			return ((size_t)-1);
+		}
+		handle->cv_shared->ci_discard_ilseq = true;
+		handle->cv_shared->ci_hooks = NULL;
+		cs->srcbuf_len = 0;
+		cs->initialized = true;
+		if (s == NULL)
+			return (1);
+	}
+
+	assert(cs->srcbuf_len < sizeof(cs->srcbuf.widechar) / sizeof(charXX_t));
+	cs->srcbuf.widechar[cs->srcbuf_len++] = c;
+
+	/* Convert the buffered unit(s) straight into the caller's buffer. */
+	src = cs->srcbuf.bytes;
+	srcleft = cs->srcbuf_len * sizeof(charXX_t);
+	dst = s;
+	dstleft = MB_CUR_MAX_L(locale);
+	err = _citrus_iconv_convert(handle, &src, &srcleft, &dst, &dstleft,
+	    0, &invlen);
+
+	/* Character is part of a surrogate pair. We need more input. */
+	if (err == EINVAL)
+		return (0);
+	cs->srcbuf_len = 0;
+
+	/* No output was produced: report an illegal sequence. */
+	if (dst == s) {
+		errno = EILSEQ;
+		return ((size_t)-1);
+	}
+	return (dst - s);	/* number of bytes stored in s */
+}
+/* Same as cXXrtomb_l(), using the locale from __get_locale(). */
+size_t
+cXXrtomb(char * __restrict s, charXX_t c, mbstate_t * __restrict ps)
+{
+
+	return (cXXrtomb_l(s, c, ps, __get_locale()));
+}
diff --git a/lib/libc/locale/mbrtoc16_iconv.c b/lib/libc/locale/mbrtoc16_iconv.c
new file mode 100644
index 000000000000..f1eaf1925496
--- /dev/null
+++ b/lib/libc/locale/mbrtoc16_iconv.c
@@ -0,0 +1,8 @@
+/* $FreeBSD$ */
+#define	charXX_t	char16_t
+#define	mbrtocXX	mbrtoc16
+#define	mbrtocXX_l	mbrtoc16_l
+#define	DSTBUF_LEN	2
+#define	UTF_XX_INTERNAL	"UTF-16-INTERNAL"
+
+#include "mbrtocXX_iconv.h"
diff --git a/lib/libc/locale/mbrtoc32_iconv.c b/lib/libc/locale/mbrtoc32_iconv.c
new file mode 100644
index 000000000000..ec2c0145d9d6
--- /dev/null
+++ b/lib/libc/locale/mbrtoc32_iconv.c
@@ -0,0 +1,8 @@
+/* $FreeBSD$ */
+#define	charXX_t	char32_t
+#define	mbrtocXX	mbrtoc32
+#define	mbrtocXX_l	mbrtoc32_l
+#define	DSTBUF_LEN	1
+#define	UTF_XX_INTERNAL	"UTF-32-INTERNAL"
+
+#include "mbrtocXX_iconv.h"
diff --git a/lib/libc/locale/mbrtocXX_iconv.h b/lib/libc/locale/mbrtocXX_iconv.h
new file mode 100644
index 000000000000..9eb6f6831eb6
--- /dev/null
+++ b/lib/libc/locale/mbrtocXX_iconv.h
@@ -0,0 +1,158 @@
+/*-
+ * Copyright (c) 2013 Ed Schouten
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/queue.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <langinfo.h>
+#include <limits.h>
+#include <string.h>
+#include <uchar.h>
+
+#include "../iconv/citrus_hash.h"
+#include "../iconv/citrus_module.h"
+#include "../iconv/citrus_iconv.h"
+#include "xlocale_private.h"
+/* Conversion state overlaid on the caller's mbstate_t. */
+typedef struct {
+	bool			initialized;
+	struct _citrus_iconv	iconv;
+	char			srcbuf[MB_LEN_MAX];
+	size_t			srcbuf_len;
+	union {
+		charXX_t	widechar[DSTBUF_LEN];
+		char		bytes[sizeof(charXX_t) * DSTBUF_LEN];
+	} dstbuf;
+	size_t			dstbuf_len;
+} _ConversionState;
+_Static_assert(sizeof(_ConversionState) <= sizeof(mbstate_t),
+    "Size of _ConversionState must not exceed mbstate_t's size.");
+/* Decode one charXX_t from at most n bytes of s; state is kept in *ps. */
+size_t
+mbrtocXX_l(charXX_t * __restrict pc, const char * __restrict s, size_t n,
+    mbstate_t * __restrict ps, locale_t locale)
+{
+	_ConversionState *cs;
+	struct _citrus_iconv *handle;
+	size_t i, retval;
+	charXX_t retchar;
+
+	FIX_LOCALE(locale);
+	if (ps == NULL)
+		ps = &locale->mbrtocXX;	/* fall back to the per-locale state */
+	cs = (_ConversionState *)ps;
+	handle = &cs->iconv;
+
+	/* (Re)open the converter in *ps on first use or when s is NULL. */
+	if (s == NULL || !cs->initialized) {
+		if (_citrus_iconv_open(&handle,
+		    nl_langinfo_l(CODESET, locale), UTF_XX_INTERNAL) != 0) {
+			cs->initialized = false;
+			errno = EINVAL;
+			return ((size_t)-1);
+		}
+		handle->cv_shared->ci_discard_ilseq = true;
+		handle->cv_shared->ci_hooks = NULL;
+		cs->srcbuf_len = cs->dstbuf_len = 0;
+		cs->initialized = true;
+		if (s == NULL)
+			return (0);
+	}
+
+	/* See if we still have characters left from the previous invocation. */
+	if (cs->dstbuf_len > 0) {
+		retval = (size_t)-3;
+		goto return_char;
+	}
+
+	/* Append s to bytes saved earlier, clamping n to the room left. */
+	if (n > sizeof(cs->srcbuf) - cs->srcbuf_len)
+		n = sizeof(cs->srcbuf) - cs->srcbuf_len;
+	memcpy(cs->srcbuf + cs->srcbuf_len, s, n);
+
+	/* Convert as few characters to the dst buffer as possible. */
+	for (i = 0; ; i++) {
+		char *src, *dst;
+		size_t srcleft, dstleft, invlen;
+		int err;
+
+		src = cs->srcbuf;
+		srcleft = cs->srcbuf_len + n;
+		dst = cs->dstbuf.bytes;
+		dstleft = i * sizeof(charXX_t);	/* room for i units */
+		assert(srcleft <= sizeof(cs->srcbuf) &&
+		    dstleft <= sizeof(cs->dstbuf.bytes));
+		err = _citrus_iconv_convert(handle, &src, &srcleft,
+		    &dst, &dstleft, 0, &invlen);
+		cs->dstbuf_len = (dst - cs->dstbuf.bytes) / sizeof(charXX_t);
+
+		/* Got new character(s). Return the first. */
+		if (cs->dstbuf_len > 0) {
+			assert((size_t)(src - cs->srcbuf) > cs->srcbuf_len);
+			retval = src - cs->srcbuf - cs->srcbuf_len;
+			cs->srcbuf_len = 0;
+			goto return_char;
+		}
+
+		/* Increase dst buffer size, to obtain the surrogate pair. */
+		if (err == E2BIG)
+			continue;
+
+		/* Illegal sequence: discard the buffered input. */
+		if (invlen > 0) {
+			cs->srcbuf_len = 0;
+			errno = EILSEQ;
+			return ((size_t)-1);
+		}
+
+		/* Save unprocessed remainder for the next invocation. */
+		memmove(cs->srcbuf, src, srcleft);
+		cs->srcbuf_len = srcleft;
+		return ((size_t)-2);
+	}
+
+return_char:
+	retchar = cs->dstbuf.widechar[0];	/* pop the oldest unit */
+	memmove(&cs->dstbuf.widechar[0], &cs->dstbuf.widechar[1],
+	    --cs->dstbuf_len * sizeof(charXX_t));
+	if (pc != NULL)
+		*pc = retchar;
+	if (retchar == 0)
+		return (0);
+	return (retval);
+}
+/* Same as mbrtocXX_l(), using the locale from __get_locale(). */
+size_t
+mbrtocXX(charXX_t * __restrict pc, const char * __restrict s, size_t n,
+    mbstate_t * __restrict ps)
+{
+
+	return (mbrtocXX_l(pc, s, n, ps, __get_locale()));
+}
diff --git a/tools/regression/lib/libc/locale/test-c16rtomb.c b/tools/regression/lib/libc/locale/test-c16rtomb.c
index eb889468e8c4..2c188fa337b8 100644
--- a/tools/regression/lib/libc/locale/test-c16rtomb.c
+++ b/tools/regression/lib/libc/locale/test-c16rtomb.c
@@ -82,6 +82,34 @@ main(int argc, char *argv[])
 	assert(c16rtomb(buf, 0xd83d, &s) == 0);
 	assert(c16rtomb(buf, 0xdca9, &s) == (size_t)-1);
 	assert(errno == EILSEQ);
+	assert((unsigned char)buf[0] == 0xcc);
+
+	/*
+	 * ISO8859-1.
+	 */
+
+	assert(strcmp(setlocale(LC_CTYPE, "en_US.ISO8859-1"),
+	    "en_US.ISO8859-1") == 0);
+
+	/* Unicode character 'Euro sign'. */
+	memset(&s, 0, sizeof(s));
+	memset(buf, 0xcc, sizeof(buf));
+	assert(c16rtomb(buf, 0x20ac, &s) == (size_t)-1);
+	assert(errno == EILSEQ);
+	assert((unsigned char)buf[0] == 0xcc);
+
+	/*
+	 * ISO8859-15.
+	 */
+
+	assert(strcmp(setlocale(LC_CTYPE, "en_US.ISO8859-15"),
+	    "en_US.ISO8859-15") == 0);
+
+	/* Unicode character 'Euro sign'. */
+	memset(&s, 0, sizeof(s));
+	memset(buf, 0xcc, sizeof(buf));
+	assert(c16rtomb(buf, 0x20ac, &s) == 1);
+	assert((unsigned char)buf[0] == 0xa4 && (unsigned char)buf[1] == 0xcc);
 
 	/*
 	 * UTF-8.
@@ -104,12 +132,14 @@ main(int argc, char *argv[])
 	assert(c16rtomb(buf, 0xd83d, &s) == 0);
 	assert(c16rtomb(buf, L'A', &s) == (size_t)-1);
 	assert(errno == EILSEQ);
+	assert((unsigned char)buf[0] == 0xcc);
 
 	/* Invalid code; 'Pile of poo' without the lead surrogate.
*/ memset(&s, 0, sizeof(s)); memset(buf, 0xcc, sizeof(buf)); assert(c16rtomb(buf, 0xdca9, &s) == (size_t)-1); assert(errno == EILSEQ); + assert((unsigned char)buf[0] == 0xcc); printf("ok 1 - c16rtomb()\n"); } diff --git a/tools/regression/lib/libc/locale/test-mbrtoc16.c b/tools/regression/lib/libc/locale/test-mbrtoc16.c index 88e8091d43ac..f709a9c80323 100644 --- a/tools/regression/lib/libc/locale/test-mbrtoc16.c +++ b/tools/regression/lib/libc/locale/test-mbrtoc16.c @@ -85,6 +85,37 @@ main(int argc, char *argv[]) assert(mbrtoc16(&c16, "", 0, &s) == (size_t)-2); assert(c16 == L'z'); + /* Check that mbrtoc16() doesn't read ahead too aggressively. */ + memset(&s, 0, sizeof(s)); + assert(mbrtoc16(&c16, "AB", 2, &s) == 1); + assert(c16 == L'A'); + assert(mbrtoc16(&c16, "C", 1, &s) == 1); + assert(c16 == L'C'); + + /* + * ISO-8859-1. + */ + + assert(strcmp(setlocale(LC_CTYPE, "en_US.ISO8859-1"), + "en_US.ISO8859-1") == 0); + + /* Currency sign. */ + memset(&s, 0, sizeof(s)); + assert(mbrtoc16(&c16, "\xa4", 1, &s) == 1); + assert(c16 == 0xa4); + + /* + * ISO-8859-15. + */ + + assert(strcmp(setlocale(LC_CTYPE, "en_US.ISO8859-15"), + "en_US.ISO8859-15") == 0); + + /* Euro sign. */ + memset(&s, 0, sizeof(s)); + assert(mbrtoc16(&c16, "\xa4", 1, &s) == 1); + assert(c16 == 0x20ac); + /* * UTF-8. */ @@ -144,6 +175,20 @@ main(int argc, char *argv[]) assert(mbrtoc16(&c16, "", 0, &s) == (size_t)-3); assert(c16 == 0xdca9); + /* Letter e with acute, precomposed. */ + memset(&s, 0, sizeof(s)); + c16 = 0; + assert(mbrtoc16(&c16, "\xc3\xa9", 2, &s) == 2); + assert(c16 == 0xe9); + + /* Letter e with acute, combined. */ + memset(&s, 0, sizeof(s)); + c16 = 0; + assert(mbrtoc16(&c16, "\x65\xcc\x81", 3, &s) == 1); + assert(c16 == 0x65); + assert(mbrtoc16(&c16, "\xcc\x81", 2, &s) == 2); + assert(c16 == 0x301); + printf("ok 1 - mbrtoc16()\n"); return (0);