freebsd-dev/sys/libkern/iconv_ucs.c

/*-
 * Copyright (c) 2003, 2005 Ryuichiro Imura
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/iconv.h>

#include "iconv_converter_if.h"

/*
 * "UCS" converter
 */

/*
 * Bits kept in iconv_ucs.convtype.
 *
 *  COMBINE     - a second charset pair (cspf) was supplied at open time.
 *  FROM_x/TO_x - the source / destination charset is a member of
 *                unicode_family[] and is handled inline by this converter:
 *                UTF8 selects the UTF-8 codec, LE selects byte-swapped
 *                16-bit units, UTF16 permits surrogate pairs.
 *  UCS4        - set when ENCODING_UNICODE is the same string as
 *                ENCODING_UTF16; enables 4-byte (surrogate pair) units.
 */
#define	KICONV_UCS_COMBINE	0x1
#define	KICONV_UCS_FROM_UTF8	0x2
#define	KICONV_UCS_TO_UTF8	0x4
#define	KICONV_UCS_FROM_LE	0x8
#define	KICONV_UCS_TO_LE	0x10
#define	KICONV_UCS_FROM_UTF16	0x20
#define	KICONV_UCS_TO_UTF16	0x40
#define	KICONV_UCS_UCS4		0x80

/* Charset names this file compares against and opens helpers with. */
#define	ENCODING_UTF16	"UTF-16BE"
#define	ENCODING_UTF8	"UTF-8"

/*
 * Charsets that iconv_ucs_open() recognises by name.  When the source or
 * destination charset matches an entry, the corresponding from_flag or
 * to_flag bits are OR-ed into convtype.  The table is terminated by an
 * entry whose name is NULL.
 */
static struct {
	const char *name;
	int from_flag, to_flag;
} unicode_family[] = {
	{ "UTF-8",	KICONV_UCS_FROM_UTF8,	KICONV_UCS_TO_UTF8 },
	{ "UCS-2LE",	KICONV_UCS_FROM_LE,	KICONV_UCS_TO_LE },
	{ "UTF-16BE",	KICONV_UCS_FROM_UTF16,	KICONV_UCS_TO_UTF16 },
	{ "UTF-16LE",	KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
	    KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
	{ NULL,		0,	0 }
};

/*
 * Helpers defined at the end of this file: UTF-8 <-> code point codecs
 * and packing/unpacking of UTF-16 surrogate pairs.
 */
static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
static uint32_t encode_surrogate(uint32_t code);
static uint32_t decode_surrogate(const u_char *ucs);

/*
 * Declare a dependency of iconv_ucs on libiconv, only when the build
 * environment provides MODULE_DEPEND.
 * NOTE(review): the three numbers are presumably the minimum, preferred
 * and maximum acceptable libiconv versions -- confirm against module(9).
 */
#ifdef MODULE_DEPEND
MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
#endif

/*
 * UCS converter instance
 */
struct iconv_ucs {
	KOBJ_FIELDS;
	int			convtype;	/* KICONV_UCS_* flag bits */
	struct iconv_cspair *	d_csp;		/* pair passed to open as csp */
	struct iconv_cspair *	d_cspf;		/* cspf, when a reference was taken on it */
	void *			f_ctp;		/* helper: source charset -> ENCODING_UNICODE */
	void *			t_ctp;		/* helper: ENCODING_UNICODE -> destination charset */
	void *			ctype;		/* handle passed to towlower()/towupper() */
};

/*
 * Create a UCS converter instance.
 *
 *  dcp  - converter class the new kobj is created from.
 *  csp  - charset pair; its cp_to names the destination charset and, when
 *         cspf is NULL, its cp_from names the source charset.
 *  cspf - optional second pair; when non-NULL its cp_from names the source
 *         charset and the instance is marked KICONV_UCS_COMBINE.
 *  dpp  - receives the new instance.
 *
 * Always returns 0.
 */
static int
iconv_ucs_open(struct iconv_converter_class *dcp,
	struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
{
	struct iconv_ucs *dp;
	int i;
	const char *from, *to;

	dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
	to = csp->cp_to;
	from = cspf ? cspf->cp_from : csp->cp_from;

	dp->convtype = 0;

	if (cspf)
		dp->convtype |= KICONV_UCS_COMBINE;
	/* Pick up the inline-codec flags for either side, if any. */
	for (i = 0; unicode_family[i].name; i++) {
		if (strcmp(from, unicode_family[i].name) == 0)
			dp->convtype |= unicode_family[i].from_flag;
		if (strcmp(to, unicode_family[i].name) == 0)
			dp->convtype |= unicode_family[i].to_flag;
	}
	/* Surrogate pairs are only usable when the pivot encoding is UTF-16. */
	if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
		dp->convtype |= KICONV_UCS_UCS4;
	else
		dp->convtype &= ~KICONV_UCS_UCS4;

	/*
	 * Open helper converters for whichever side is not handled inline.
	 * NOTE(review): the return value of iconv_open() is ignored here;
	 * iconv_ucs_conv() only uses a helper whose handle is non-NULL.
	 */
	dp->f_ctp = dp->t_ctp = NULL;
	if (dp->convtype & KICONV_UCS_COMBINE) {
		if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
		    (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
			iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
		}
		if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
		    (dp->convtype & KICONV_UCS_TO_LE) == 0) {
			iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
		}
	}

	/* Case-mapping table, needed only when UTF-8 is involved. */
	dp->ctype = NULL;
	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
		iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);

	dp->d_csp = csp;
	/*
	 * iconv_ucs_close() tests d_cspf unconditionally, so give it a
	 * defined value on the paths below that do not assign it.
	 */
	dp->d_cspf = NULL;
	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
		if (cspf) {
			dp->d_cspf = cspf;
			cspf->cp_refcount++;
		} else
			csp->cp_refcount++;
	}
	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
		csp->cp_refcount++;
	*dpp = (void*)dp;
	return 0;
}

static int
iconv_ucs_close(void *data)
{
	struct iconv_ucs *dp = data;

	if (dp->f_ctp)
		iconv_close(dp->f_ctp);
	if (dp->t_ctp)
		iconv_close(dp->t_ctp);
	if (dp->ctype)
		iconv_close(dp->ctype);
	if (dp->d_cspf)
		dp->d_cspf->cp_refcount--;
	else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
		dp->d_csp->cp_refcount--;
	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
		dp->d_csp->cp_refcount--;
	kobj_delete((struct kobj*)data, M_ICONV);
	return 0;
}

/*
 * Convert characters from *inbuf to *outbuf through ENCODING_UNICODE.
 *
 *  d2p          - instance created by iconv_ucs_open().
 *  inbuf        - in/out: source cursor, advanced by the bytes consumed.
 *  inbytesleft  - in/out: source bytes remaining.
 *  outbuf       - in/out: destination cursor, advanced by the bytes produced.
 *  outbytesleft - in/out: destination bytes remaining.
 *  convchar     - when 1, stop after a single character.
 *  casetype     - KICONV_* case-mapping request.
 *
 * Returns 0 when the loop ends normally (or when any buffer pointer is
 * NULL), -1 on malformed/unrepresentable input or insufficient room, or
 * the non-zero result of a helper converter.  On failure the cursors
 * reflect only the characters that were completely converted.
 */
static int
iconv_ucs_conv(void *d2p, const char **inbuf,
	size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
	int convchar, int casetype)
{
	struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
	int ret = 0, i;
	size_t in, on, ir, or, inlen, outlen, ucslen;
	const char *src, *p;
	char *dst;
	u_char ucs[4], *q;
	uint32_t code;

	if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
		return 0;
	ir = in = *inbytesleft;
	or = on = *outbytesleft;
	src = *inbuf;
	dst = *outbuf;

	while (ir > 0 && or > 0) {

		/*
		 * The first half of conversion.
		 * (convert any code into ENCODING_UNICODE)
		 *
		 * On success ucs[] holds ucslen (2 or 4) big-endian bytes
		 * and inlen is the number of source bytes they represent.
		 */
		code = 0;
		p = src;
		if (dp->convtype & KICONV_UCS_FROM_UTF8) {
			/* convert UTF-8 to ENCODING_UNICODE */
			inlen = 0;
			code = utf8_to_ucs4(p, &inlen, ir);
			if (code == 0) {
				ret = -1;
				break;
			}

			if (casetype == KICONV_FROM_LOWER && dp->ctype) {
				code = towlower(code, dp->ctype);
			} else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
				code = towupper(code, dp->ctype);
			}

			if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
				/* reserved for utf-16 surrogate pair */
				/* invalid unicode */
				ret = -1;
				break;
			}

			if (inlen == 4) {
				if (code < 0x10000) {
					/*
					 * A 4-byte sequence must carry a
					 * supplementary code point; anything
					 * lower would underflow in
					 * encode_surrogate().
					 */
					ret = -1;
					break;
				}
				if (dp->convtype & KICONV_UCS_UCS4) {
					ucslen = 4;
					code = encode_surrogate(code);
				} else {
					/* can't handle with ucs-2 */
					ret = -1;
					break;
				}
			} else {
				ucslen = 2;
			}

			/* save UCS-4 into ucs[] */
			for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
				*q++ = (code >> (i << 3)) & 0xff;

		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
			/* convert local code to ENCODING_UNICODE */
			ucslen = 4;
			inlen = ir;
			q = ucs;
			ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
			    &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
			if (ret)
				break;
			/* turn "bytes left" into "bytes used" */
			inlen = ir - inlen;
			ucslen = 4 - ucslen;

		} else {
			/* src code is a proper subset of ENCODING_UNICODE */
			if (ir < 2) {
				/* not even one 16-bit unit left to read */
				ret = -1;
				break;
			}
			q = ucs;
			if (dp->convtype & KICONV_UCS_FROM_LE) {
				*q = *(p + 1);
				*(q + 1) = *p;
				p += 2;
			} else {
				*q = *p++;
				*(q + 1) = *p++;
			}
			if ((*q & 0xfc) == 0xd8) {
				/* high surrogate: a second unit must follow */
				if (dp->convtype & KICONV_UCS_UCS4 &&
				    dp->convtype & KICONV_UCS_FROM_UTF16) {
					inlen = ucslen = 4;
				} else {
					/* invalid unicode */
					ret = -1;
					break;
				}
			} else {
				inlen = ucslen = 2;
			}
			if (ir < inlen) {
				ret = -1;
				break;
			}
			if (ucslen == 4) {
				q += 2;
				if (dp->convtype & KICONV_UCS_FROM_LE) {
					*q = *(p + 1);
					*(q + 1) = *p;
				} else {
					*q = *p++;
					*(q + 1) = *p;
				}
				if ((*q & 0xfc) != 0xdc) {
					/* invalid unicode */
					ret = -1;
					break;
				}
			}
		}

		/*
		 * The second half of conversion.
		 * (convert ENCODING_UNICODE into any code)
		 */
		p = ucs;
		if (dp->convtype & KICONV_UCS_TO_UTF8) {
			q = (u_char *)dst;
			if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
				/* decode surrogate pair */
				code = decode_surrogate(p);
			} else {
				code = (ucs[0] << 8) | ucs[1];
			}

			if (casetype == KICONV_LOWER && dp->ctype) {
				code = towlower(code, dp->ctype);
			} else if (casetype == KICONV_UPPER && dp->ctype) {
				code = towupper(code, dp->ctype);
			}

			outlen = 0;
			if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
				ret = -1;
				break;
			}

			src += inlen;
			ir -= inlen;
			dst += outlen;
			or -= outlen;

		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
			ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
			    &or, casetype & (KICONV_LOWER | KICONV_UPPER));
			if (ret)
				break;

			src += inlen;
			ir -= inlen;

		} else {
			/* dst code is a proper subset of ENCODING_UNICODE */
			if (or < ucslen) {
				ret = -1;
				break;
			}
			/*
			 * Reject an unrepresentable surrogate pair before
			 * touching any cursor, so a failure leaves the
			 * caller's buffers describing only complete output.
			 */
			if (ucslen == 4 &&
			    ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
			    (dp->convtype & KICONV_UCS_TO_UTF16) == 0)) {
				ret = -1;
				break;
			}
			src += inlen;
			ir -= inlen;
			or -= ucslen;
			if (dp->convtype & KICONV_UCS_TO_LE) {
				*dst++ = *(p + 1);
				*dst++ = *p;
				p += 2;
			} else {
				*dst++ = *p++;
				*dst++ = *p++;
			}
			if (ucslen == 4) {
				if (dp->convtype & KICONV_UCS_TO_LE) {
					*dst++ = *(p + 1);
					*dst++ = *p;
				} else {
					*dst++ = *p++;
					*dst++ = *p;
				}
			}
		}

		if (convchar == 1)
			break;
	}

	/* Report progress back to the caller. */
	*inbuf += in - ir;
	*outbuf += on - or;
	*inbytesleft -= in - ir;
	*outbytesleft -= on - or;
	return (ret);
}

/*
 * Class initialisation: register both conversion directions between
 * ENCODING_UNICODE and ENCODING_UTF8 via iconv_add().  Returns the first
 * non-zero iconv_add() result, or 0 when both calls succeed.
 */
static int
iconv_ucs_init(struct iconv_converter_class *dcp)
{
	int rc;

	rc = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
	if (rc == 0)
		rc = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
	return (rc);
}

/*
 * Class teardown hook.  There is nothing to undo, so this simply
 * reports success.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	(void)dcp;	/* unused */

	return (0);
}

/*
 * Report the name of this converter class, which is the
 * ENCODING_UNICODE string.
 */
static const char *
iconv_ucs_name(struct iconv_converter_class *dcp)
{
	const char *class_name = ENCODING_UNICODE;

	(void)dcp;	/* unused */

	return (class_name);
}

/*
 * Method table binding the iconv_converter interface to the
 * implementations in this file.  The {0, 0} entry terminates the list.
 */
static kobj_method_t iconv_ucs_methods[] = {
	KOBJMETHOD(iconv_converter_open,	iconv_ucs_open),
	KOBJMETHOD(iconv_converter_close,	iconv_ucs_close),
	KOBJMETHOD(iconv_converter_conv,	iconv_ucs_conv),
	KOBJMETHOD(iconv_converter_init,	iconv_ucs_init),
	KOBJMETHOD(iconv_converter_done,	iconv_ucs_done),
	KOBJMETHOD(iconv_converter_name,	iconv_ucs_name),
	{0, 0}
};

/*
 * NOTE(review): presumably declares the "ucs" converter class with
 * instances of sizeof(struct iconv_ucs) bytes, using the method table
 * above -- confirm against the KICONV_CONVERTER definition in
 * <sys/iconv.h>.
 */
KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));

/*
 * Decode one UTF-8 sequence.
 *
 *  src       - first byte of the sequence.
 *  utf8width - receives the sequence length (1..4); written only on success.
 *  srclen    - number of readable bytes at src.
 *
 * Returns the decoded code point, or 0 when the input is empty, truncated,
 * has an illegal lead or continuation byte, or is an overlong (non-shortest)
 * encoding.  A literal NUL byte also yields 0, so callers cannot tell it
 * apart from an error.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	/* smallest code point that legitimately needs a w-byte sequence */
	static const uint32_t min_code[] = { 0, 0, 0x80, 0x800, 0x10000 };
	size_t i, w = 0;
	uint32_t ucs4 = 0;

	/* nothing to read: do not touch *src */
	if (srclen == 0)
		return (0);

	/*
	 * get leading 1 byte from utf-8
	 */
	if ((*src & 0x80) == 0) {
		/*
		 * leading 1 bit is "0"
		 *  utf-8: 0xxxxxxx
		 *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
		 */
		w = 1;
		/* get trailing 7 bits */
		ucs4 = *src & 0x7f;
	} else if ((*src & 0xe0) == 0xc0) {
		/*
		 * leading 3 bits are "110"
		 *  utf-8: 110xxxxx 10yyyyyy
		 *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
		 */
		w = 2;
		/* get trailing 5 bits */
		ucs4 = *src & 0x1f;
	} else if ((*src & 0xf0) == 0xe0) {
		/*
		 * leading 4 bits are "1110"
		 *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
		 */
		w = 3;
		/* get trailing 4 bits */
		ucs4 = *src & 0x0f;
	} else if ((*src & 0xf8) == 0xf0) {
		/*
		 * leading 5 bits are "11110"
		 *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
		 */
		w = 4;
		/* get trailing 3 bits */
		ucs4 = *src & 0x07;
	} else {
		/* out of utf-16 range or having illegal bits */
		return (0);
	}

	if (srclen < w)
		return (0);

	/*
	 * get left parts from utf-8
	 */
	for (i = 1 ; i < w ; i++) {
		if ((*(src + i) & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		/* concatenate trailing 6 bits into ucs4 */
		ucs4 <<= 6;
		ucs4 |= *(src + i) & 0x3f;
	}

	/*
	 * Reject overlong forms: a value that fits in a shorter sequence
	 * must not be accepted in a longer one (e.g. C1 81 for 'A', or
	 * E0 80 AF for '/').
	 */
	if (ucs4 < min_code[w])
		return (0);

	*utf8width = w;
	return (ucs4);
}

static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char lead, *p;
	size_t i, w;

	/*
	 * determine utf-8 width and leading bits
	 */
	if (ucs4 < 0x80) {
		w = 1;
		lead = 0;	/* "0" */
	} else if (ucs4 < 0x800) {
		w = 2;
		lead = 0xc0;	/* "11" */
	} else if (ucs4 < 0x10000) {
		w = 3;
		lead = 0xe0;	/* "111" */
	} else if (ucs4 < 0x200000) {
		w = 4;
		lead = 0xf0;	/* "1111" */
	} else {
		return (NULL);
	}

	if (dstlen < w)
		return (NULL);

	/*
	 * construct utf-8
	 */
	p = dst;
	for (i = w - 1 ; i >= 1 ; i--) {
		/* get trailing 6 bits and put it with leading bit as "1" */
		*(p + i) = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	*p = ucs4 | lead;

	*utf8width = w;

	return (p);
}

/*
 * Pack a supplementary code point into a UTF-16 surrogate pair.  The
 * result carries the high surrogate in the upper 16 bits and the low
 * surrogate in the lower 16 bits.
 */
static uint32_t
encode_surrogate(register uint32_t code)
{
	const uint32_t offset = code - 0x10000;
	/* upper 10 bits of the offset, moved to bits 16..25 */
	const uint32_t high_bits = (offset << 6) & 0x3ff0000;
	/* lower 10 bits of the offset, left in bits 0..9 */
	const uint32_t low_bits = offset & 0x3ff;

	return (0xd800dc00 | high_bits | low_bits);
}

/*
 * Unpack a surrogate pair stored as four big-endian bytes (high unit in
 * ucs[0..1], low unit in ucs[2..3]) into the code point it represents.
 * Only the low 10 bits of each unit are used; the surrogate tag bits are
 * not checked here.
 */
static uint32_t
decode_surrogate(register const u_char *ucs)
{
	uint32_t high_bits, low_bits;

	high_bits = ((ucs[0] & 0x3) << 8) | ucs[1];
	low_bits = ((ucs[2] & 0x3) << 8) | ucs[3];

	return (((high_bits << 10) | low_bits) + 0x10000);
}
Add unicode support to msdosfs and smbfs; original pathes from imura, bug fixes by Kuan-Chung Chiu <buganini at gmail dot com>. Tested by me in production for several days at work. 2011-11-18 03:05:20 +00:00			`/*-`
			`* Copyright (c) 2003, 2005 Ryuichiro Imura`
			`* All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* 1. Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* 2. Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
			`* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE`
			`* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE`
			`* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL`
			`* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS`
			`* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)`
			`* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT`
			`* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY`
			`* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF`
			`* SUCH DAMAGE.`
			`*/`

			`#include <sys/cdefs.h>`
			`__FBSDID("$FreeBSD$");`

			`#include <sys/param.h>`
			`#include <sys/kernel.h>`
			`#include <sys/systm.h>`
			`#include <sys/malloc.h>`
			`#include <sys/iconv.h>`

			`#include "iconv_converter_if.h"`

			`/*`
			`* "UCS" converter`
			`*/`

			`#define KICONV_UCS_COMBINE 0x1`
			`#define KICONV_UCS_FROM_UTF8 0x2`
			`#define KICONV_UCS_TO_UTF8 0x4`
			`#define KICONV_UCS_FROM_LE 0x8`
			`#define KICONV_UCS_TO_LE 0x10`
			`#define KICONV_UCS_FROM_UTF16 0x20`
			`#define KICONV_UCS_TO_UTF16 0x40`
			`#define KICONV_UCS_UCS4 0x80`

			`#define ENCODING_UTF16 "UTF-16BE"`
			`#define ENCODING_UTF8 "UTF-8"`

			`static struct {`
			`const char *name;`
			`int from_flag, to_flag;`
			`} unicode_family[] = {`
			`{ "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 },`
			`{ "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE },`
			`{ "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 },`
			`{ "UTF-16LE", KICONV_UCS_FROM_UTF16\|KICONV_UCS_FROM_LE,`
			`KICONV_UCS_TO_UTF16\|KICONV_UCS_TO_LE },`
			`{ NULL, 0, 0 }`
			`};`

			`static uint32_t utf8_to_ucs4(const char src, size_t utf8width, size_t srclen);`
			`static u_char ucs4_to_utf8(uint32_t ucs4, char dst, size_t *utf8width, size_t dstlen);`
			`static uint32_t encode_surrogate(uint32_t code);`
			`static uint32_t decode_surrogate(const u_char *ucs);`

			`#ifdef MODULE_DEPEND`
			`MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);`
			`#endif`

			`/*`
			`* UCS converter instance`
			`*/`
			`struct iconv_ucs {`
			`KOBJ_FIELDS;`
			`int convtype;`
			`struct iconv_cspair * d_csp;`
			`struct iconv_cspair * d_cspf;`
			`void * f_ctp;`
			`void * t_ctp;`
			`void * ctype;`
			`};`

			`static int`
			`iconv_ucs_open(struct iconv_converter_class *dcp,`
			`struct iconv_cspair csp, struct iconv_cspair cspf, void **dpp)`
			`{`
			`struct iconv_ucs *dp;`
			`int i;`
			`const char from, to;`

			`dp = (struct iconv_ucs )kobj_create((struct kobj_class)dcp, M_ICONV, M_WAITOK);`
			`to = csp->cp_to;`
			`from = cspf ? cspf->cp_from : csp->cp_from;`

			`dp->convtype = 0;`

			`if (cspf)`
			`dp->convtype \|= KICONV_UCS_COMBINE;`
			`for (i = 0; unicode_family[i].name; i++) {`
Use strcmp that I replaced by accident. 2012-05-21 02:45:47 +00:00			`if (strcmp(from, unicode_family[i].name) == 0)`
Add unicode support to msdosfs and smbfs; original pathes from imura, bug fixes by Kuan-Chung Chiu <buganini at gmail dot com>. Tested by me in production for several days at work. 2011-11-18 03:05:20 +00:00			`dp->convtype \|= unicode_family[i].from_flag;`
Use strcmp that I replaced by accident. 2012-05-21 02:45:47 +00:00			`if (strcmp(to, unicode_family[i].name) == 0)`
Add unicode support to msdosfs and smbfs; original pathes from imura, bug fixes by Kuan-Chung Chiu <buganini at gmail dot com>. Tested by me in production for several days at work. 2011-11-18 03:05:20 +00:00			`dp->convtype \|= unicode_family[i].to_flag;`
			`}`
Use strcmp that I replaced by accident. 2012-05-21 02:45:47 +00:00			`if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)`
Add unicode support to msdosfs and smbfs; original pathes from imura, bug fixes by Kuan-Chung Chiu <buganini at gmail dot com>. Tested by me in production for several days at work. 2011-11-18 03:05:20 +00:00			`dp->convtype \|= KICONV_UCS_UCS4;`
			`else`
			`dp->convtype &= ~KICONV_UCS_UCS4;`

			`dp->f_ctp = dp->t_ctp = NULL;`
			`if (dp->convtype & KICONV_UCS_COMBINE) {`
			`if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&`
			`(dp->convtype & KICONV_UCS_FROM_LE) == 0) {`
			`iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);`
			`}`
			`if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&`
			`(dp->convtype & KICONV_UCS_TO_LE) == 0) {`
			`iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);`
			`}`
			`}`

			`dp->ctype = NULL;`
			`if (dp->convtype & (KICONV_UCS_FROM_UTF8 \| KICONV_UCS_TO_UTF8))`
			`iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);`

			`dp->d_csp = csp;`
			`if (dp->convtype & (KICONV_UCS_FROM_UTF8 \| KICONV_UCS_FROM_LE)) {`
			`if (cspf) {`
			`dp->d_cspf = cspf;`
			`cspf->cp_refcount++;`
			`} else`
			`csp->cp_refcount++;`
			`}`
			`if (dp->convtype & (KICONV_UCS_TO_UTF8 \| KICONV_UCS_TO_LE))`
			`csp->cp_refcount++;`
			`dpp = (void)dp;`
			`return 0;`
			`}`

			`static int`
			`iconv_ucs_close(void *data)`
			`{`
			`struct iconv_ucs *dp = data;`

			`if (dp->f_ctp)`
			`iconv_close(dp->f_ctp);`
			`if (dp->t_ctp)`
			`iconv_close(dp->t_ctp);`
			`if (dp->ctype)`
			`iconv_close(dp->ctype);`
			`if (dp->d_cspf)`
			`dp->d_cspf->cp_refcount--;`
			`else if (dp->convtype & (KICONV_UCS_FROM_UTF8 \| KICONV_UCS_FROM_LE))`
			`dp->d_csp->cp_refcount--;`
			`if (dp->convtype & (KICONV_UCS_TO_UTF8 \| KICONV_UCS_TO_LE))`
			`dp->d_csp->cp_refcount--;`
			`kobj_delete((struct kobj*)data, M_ICONV);`
			`return 0;`
			`}`

			`static int`
			`iconv_ucs_conv(void d2p, const char *inbuf,`
			`size_t inbytesleft, char outbuf, size_t outbytesleft,`
			`int convchar, int casetype)`
			`{`
			`struct iconv_ucs dp = (struct iconv_ucs)d2p;`
			`int ret = 0, i;`
			`size_t in, on, ir, or, inlen, outlen, ucslen;`
			`const char src, p;`
			`char *dst;`
			`u_char ucs[4], *q;`
			`uint32_t code;`

			`if (inbuf == NULL \|\| inbuf == NULL \|\| outbuf == NULL \|\| outbuf == NULL)`
			`return 0;`
			`ir = in = *inbytesleft;`
			`or = on = *outbytesleft;`
			`src = *inbuf;`
			`dst = *outbuf;`

			`while (ir > 0 && or > 0) {`

			`/*`
			`* The first half of conversion.`
			`* (convert any code into ENCODING_UNICODE)`
			`*/`
			`code = 0;`
			`p = src;`
			`if (dp->convtype & KICONV_UCS_FROM_UTF8) {`
			`/* convert UTF-8 to ENCODING_UNICODE */`
			`inlen = 0;`
			`code = utf8_to_ucs4(p, &inlen, ir);`
			`if (code == 0) {`
			`ret = -1;`
			`break;`
			`}`

			`if (casetype == KICONV_FROM_LOWER && dp->ctype) {`
			`code = towlower(code, dp->ctype);`
			`} else if (casetype == KICONV_FROM_UPPER && dp->ctype) {`
			`code = towupper(code, dp->ctype);`
			`}`

			`if ((code >= 0xd800 && code < 0xe000) \|\| code >= 0x110000 ) {`
			`/* reserved for utf-16 surrogate pair */`
			`/* invalid unicode */`
			`ret = -1;`
			`break;`
			`}`

			`if (inlen == 4) {`
			`if (dp->convtype & KICONV_UCS_UCS4) {`
			`ucslen = 4;`
			`code = encode_surrogate(code);`
			`} else {`
			`/* can't handle with ucs-2 */`
			`ret = -1;`
			`break;`
			`}`
			`} else {`
			`ucslen = 2;`
			`}`

			`/* save UCS-4 into ucs[] */`
			`for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)`
			`*q++ = (code >> (i << 3)) & 0xff;`

			`} else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {`
			`/* convert local code to ENCODING_UNICODE */`
			`ucslen = 4;`
			`inlen = ir;`
			`q = ucs;`
			`ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,`
			`&ucslen, casetype & (KICONV_FROM_LOWER \| KICONV_FROM_UPPER));`
			`if (ret)`
			`break;`
			`inlen = ir - inlen;`
			`ucslen = 4 - ucslen;`

			`} else {`
			`/* src code is a proper subset of ENCODING_UNICODE */`
			`q = ucs;`
			`if (dp->convtype & KICONV_UCS_FROM_LE) {`
			`q = (p + 1);`
			`(q + 1) = p;`
			`p += 2;`
			`} else {`
			`q = p++;`
			`(q + 1) = p++;`
			`}`
			`if ((*q & 0xfc) == 0xd8) {`
			`if (dp->convtype & KICONV_UCS_UCS4 &&`
			`dp->convtype & KICONV_UCS_FROM_UTF16) {`
			`inlen = ucslen = 4;`
			`} else {`
			`/* invalid unicode */`
			`ret = -1;`
			`break;`
			`}`
			`} else {`
			`inlen = ucslen = 2;`
			`}`
			`if (ir < inlen) {`
			`ret = -1;`
			`break;`
			`}`
			`if (ucslen == 4) {`
			`q += 2;`
			`if (dp->convtype & KICONV_UCS_FROM_LE) {`
			`q = (p + 1);`
			`(q + 1) = p;`
			`} else {`
			`q = p++;`
			`(q + 1) = p;`
			`}`
			`if ((*q & 0xfc) != 0xdc) {`
			`/* invalid unicode */`
			`ret = -1;`
			`break;`
			`}`
			`}`
			`}`

			`/*`
			`* The second half of conversion.`
			`* (convert ENCODING_UNICODE into any code)`
			`*/`
			`p = ucs;`
			`if (dp->convtype & KICONV_UCS_TO_UTF8) {`
			`q = (u_char *)dst;`
			`if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {`
			`/* decode surrogate pair */`
			`code = decode_surrogate(p);`
			`} else {`
			`code = (ucs[0] << 8) \| ucs[1];`
			`}`

			`if (casetype == KICONV_LOWER && dp->ctype) {`
			`code = towlower(code, dp->ctype);`
			`} else if (casetype == KICONV_UPPER && dp->ctype) {`
			`code = towupper(code, dp->ctype);`
			`}`

			`outlen = 0;`
			`if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {`
			`ret = -1;`
			`break;`
			`}`

			`src += inlen;`
			`ir -= inlen;`
			`dst += outlen;`
			`or -= outlen;`

			`} else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {`
			`ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,`
			`&or, casetype & (KICONV_LOWER \| KICONV_UPPER));`
			`if (ret)`
			`break;`

			`src += inlen;`
			`ir -= inlen;`

			`} else {`
			`/* dst code is a proper subset of ENCODING_UNICODE */`
			`if (or < ucslen) {`
			`ret = -1;`
			`break;`
			`}`
			`src += inlen;`
			`ir -= inlen;`
			`or -= ucslen;`
			`if (dp->convtype & KICONV_UCS_TO_LE) {`
			`dst++ = (p + 1);`
			`dst++ = p;`
			`p += 2;`
			`} else {`
			`dst++ = p++;`
			`dst++ = p++;`
			`}`
			`if (ucslen == 4) {`
			`if ((dp->convtype & KICONV_UCS_UCS4) == 0 \|\|`
			`(dp->convtype & KICONV_UCS_TO_UTF16) == 0) {`
			`ret = -1;`
			`break;`
			`}`
			`if (dp->convtype & KICONV_UCS_TO_LE) {`
			`dst++ = (p + 1);`
			`dst++ = p;`
			`} else {`
			`dst++ = p++;`
			`dst++ = p;`
			`}`
			`}`
			`}`

			`if (convchar == 1)`
			`break;`
			`}`

			`*inbuf += in - ir;`
			`*outbuf += on - or;`
			`*inbytesleft -= in - ir;`
			`*outbytesleft -= on - or;`
			`return (ret);`
			`}`

			`static int`
			`iconv_ucs_init(struct iconv_converter_class *dcp)`
			`{`
			`int error;`

			`error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);`
			`if (error)`
			`return (error);`
			`error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);`
			`if (error)`
			`return (error);`
			`return (0);`
			`}`

			`static int`
			`iconv_ucs_done(struct iconv_converter_class *dcp)`
			`{`
			`return (0);`
			`}`

			`static const char *`
			`iconv_ucs_name(struct iconv_converter_class *dcp)`
			`{`
			`return (ENCODING_UNICODE);`
			`}`

			`static kobj_method_t iconv_ucs_methods[] = {`
			`KOBJMETHOD(iconv_converter_open, iconv_ucs_open),`
			`KOBJMETHOD(iconv_converter_close, iconv_ucs_close),`
			`KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv),`
			`KOBJMETHOD(iconv_converter_init, iconv_ucs_init),`
			`KOBJMETHOD(iconv_converter_done, iconv_ucs_done),`
			`KOBJMETHOD(iconv_converter_name, iconv_ucs_name),`
			`{0, 0}`
			`};`

			`KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));`

			`static uint32_t`
			`utf8_to_ucs4(const char src, size_t utf8width, size_t srclen)`
			`{`
			`size_t i, w = 0;`
			`uint32_t ucs4 = 0;`

			`/*`
			`* get leading 1 byte from utf-8`
			`*/`
			`if ((*src & 0x80) == 0) {`
			`/*`
			`* leading 1 bit is "0"`
			`* utf-8: 0xxxxxxx`
			`* ucs-4: 00000000 00000000 00000000 0xxxxxxx`
			`*/`
			`w = 1;`
			`/* get trailing 7 bits */`
			`ucs4 = *src & 0x7f;`
			`} else if ((*src & 0xe0) == 0xc0) {`
			`/*`
			`* leading 3 bits are "110"`
			`* utf-8: 110xxxxx 10yyyyyy`
			`* ucs-4: 00000000 00000000 00000xxx xxyyyyyy`
			`*/`
			`w = 2;`
			`/* get trailing 5 bits */`
			`ucs4 = *src & 0x1f;`
			`} else if ((*src & 0xf0) == 0xe0) {`
			`/*`
			`* leading 4 bits are "1110"`
			`* utf-8: 1110xxxx 10yyyyyy 10zzzzzz`
			`* ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz`
			`*/`
			`w = 3;`
			`/* get trailing 4 bits */`
			`ucs4 = *src & 0x0f;`
			`} else if ((*src & 0xf8) == 0xf0) {`
			`/*`
			`* leading 5 bits are "11110"`
			`* utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz`
			`* ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz`
			`*/`
			`w = 4;`
			`/* get trailing 3 bits */`
			`ucs4 = *src & 0x07;`
			`} else {`
			`/* out of utf-16 range or having illegal bits */`
			`return (0);`
			`}`

			`if (srclen < w)`
			`return (0);`

			`/*`
			`* get left parts from utf-8`
			`*/`
			`for (i = 1 ; i < w ; i++) {`
			`if ((*(src + i) & 0xc0) != 0x80) {`
			`/* invalid: leading 2 bits are not "10" */`
			`return (0);`
			`}`
			`/* concatenate trailing 6 bits into ucs4 */`
			`ucs4 <<= 6;`
			`ucs4 \|= *(src + i) & 0x3f;`
			`}`

			`*utf8width = w;`
			`return (ucs4);`
			`}`

			`static u_char *`
			`ucs4_to_utf8(uint32_t ucs4, char dst, size_t utf8width, size_t dstlen)`
			`{`
			`u_char lead, *p;`
			`size_t i, w;`

			`/*`
			`* determine utf-8 width and leading bits`
			`*/`
			`if (ucs4 < 0x80) {`
			`w = 1;`
			`lead = 0; /* "0" */`
			`} else if (ucs4 < 0x800) {`
			`w = 2;`
			`lead = 0xc0; /* "11" */`
			`} else if (ucs4 < 0x10000) {`
			`w = 3;`
			`lead = 0xe0; /* "111" */`
			`} else if (ucs4 < 0x200000) {`
			`w = 4;`
			`lead = 0xf0; /* "1111" */`
			`} else {`
			`return (NULL);`
			`}`

			`if (dstlen < w)`
			`return (NULL);`

			`/*`
			`* construct utf-8`
			`*/`
			`p = dst;`
			`for (i = w - 1 ; i >= 1 ; i--) {`
			`/* get trailing 6 bits and put it with leading bit as "1" */`
			`*(p + i) = (ucs4 & 0x3f) \| 0x80;`
			`ucs4 >>= 6;`
			`}`
			`*p = ucs4 \| lead;`

			`*utf8width = w;`

			`return (p);`
			`}`

			`static uint32_t`
			`encode_surrogate(register uint32_t code)`
			`{`
			`return ((((code - 0x10000) << 6) & 0x3ff0000) \|`
			`((code - 0x10000) & 0x3ff) \| 0xd800dc00);`
			`}`

			`static uint32_t`
			`decode_surrogate(register const u_char *ucs)`
			`{`
			`return ((((ucs[0] & 0x3) << 18) \| (ucs[1] << 10) \|`
			`((ucs[2] & 0x3) << 8) \| ucs[3]) + 0x10000);`
			`}`