freebsd-nq/sys/libkern/iconv_ucs.c

/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2003, 2005 Ryuichiro Imura
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/iconv.h>

#include "iconv_converter_if.h"

/*
 * "UCS" converter
 */

/*
 * Bits kept in iconv_ucs.convtype.  iconv_ucs_open() derives them once from
 * the charset names and iconv_ucs_conv() dispatches on them per character.
 */
#define	KICONV_UCS_COMBINE	0x1	/* a second cspair (cspf) was given to open */
#define	KICONV_UCS_FROM_UTF8	0x2	/* source charset is UTF-8 */
#define	KICONV_UCS_TO_UTF8	0x4	/* destination charset is UTF-8 */
#define	KICONV_UCS_FROM_LE	0x8	/* source 16-bit units are little-endian */
#define	KICONV_UCS_TO_LE	0x10	/* destination 16-bit units are little-endian */
#define	KICONV_UCS_FROM_UTF16	0x20	/* source may carry surrogate pairs */
#define	KICONV_UCS_TO_UTF16	0x40	/* destination may carry surrogate pairs */
#define	KICONV_UCS_UCS4		0x80	/* set when ENCODING_UNICODE is ENCODING_UTF16 */

/* Charset names compared against / passed to the iconv layer. */
#define	ENCODING_UTF16	"UTF-16BE"
#define	ENCODING_UTF8	"UTF-8"

/*
 * Charset names this converter recognises as members of the Unicode family,
 * with the convtype bits to set when the name appears as the source
 * (from_flag) or as the destination (to_flag).  The list is terminated by an
 * entry whose name is NULL; iconv_ucs_open() matches names with strcasecmp().
 */
static struct {
	const char *name;
	int from_flag, to_flag;
} unicode_family[] = {
	{ "UTF-8",	KICONV_UCS_FROM_UTF8,	KICONV_UCS_TO_UTF8 },
	{ "UCS-2LE",	KICONV_UCS_FROM_LE,	KICONV_UCS_TO_LE },
	{ "UTF-16BE",	KICONV_UCS_FROM_UTF16,	KICONV_UCS_TO_UTF16 },
	{ "UTF-16LE",	KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
	    KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
	{ NULL,		0,	0 }
};

/* File-local helpers, defined after the converter methods below. */
static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
static uint32_t encode_surrogate(uint32_t code);
static uint32_t decode_surrogate(const u_char *ucs);

/* Only emitted when the build environment provides MODULE_DEPEND. */
#ifdef MODULE_DEPEND
MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
#endif

/*
 * UCS converter instance
 */
struct iconv_ucs {
	/* NOTE(review): presumably the kobj object header -- confirm in <sys/kobj.h> */
	KOBJ_FIELDS;
	/* bitwise OR of KICONV_UCS_* flags, computed in iconv_ucs_open() */
	int			convtype;
	/* the csp handed to open; always recorded */
	struct iconv_cspair *	d_csp;
	/* the cspf handed to open; recorded only when the source is UTF-8 or LE */
	struct iconv_cspair *	d_cspf;
	/* handle used to convert the local source charset into ENCODING_UNICODE */
	void *			f_ctp;
	/* handle used to convert ENCODING_UNICODE into the local destination */
	void *			t_ctp;
	/* handle opened with KICONV_WCTYPE_NAME; passed to towlower()/towupper() */
	void *			ctype;
};

/*
 * Create a UCS converter instance.
 *
 * dcp  - converter class; used as the kobj class for the new object.
 * csp  - charset pair; its cp_to names the destination charset and, when
 *        cspf is NULL, its cp_from names the source charset.
 * cspf - optional second charset pair; when non-NULL its cp_from names the
 *        source charset and KICONV_UCS_COMBINE is set.
 * dpp  - receives the new instance.
 *
 * Always returns 0.  Failures of the nested iconv_open() calls are not
 * reported: the corresponding handle simply stays NULL and iconv_ucs_conv()
 * skips the step that would have used it.
 */
static int
iconv_ucs_open(struct iconv_converter_class *dcp,
	struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
{
	struct iconv_ucs *dp;
	int i;
	const char *from, *to;

	dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
	to = csp->cp_to;
	from = cspf ? cspf->cp_from : csp->cp_from;

	dp->convtype = 0;

	if (cspf)
		dp->convtype |= KICONV_UCS_COMBINE;

	/* Charset names are user supplied, so match them case-insensitively. */
	for (i = 0; unicode_family[i].name; i++) {
		if (strcasecmp(from, unicode_family[i].name) == 0)
			dp->convtype |= unicode_family[i].from_flag;
		if (strcasecmp(to, unicode_family[i].name) == 0)
			dp->convtype |= unicode_family[i].to_flag;
	}

	/* Both operands are compile-time strings; this selects surrogate support. */
	if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
		dp->convtype |= KICONV_UCS_UCS4;
	else
		dp->convtype &= ~KICONV_UCS_UCS4;

	/*
	 * In combined mode, a side that is neither UTF-8 nor little-endian
	 * needs a helper converter to or from ENCODING_UNICODE.
	 */
	dp->f_ctp = dp->t_ctp = NULL;
	if (dp->convtype & KICONV_UCS_COMBINE) {
		if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
		    (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
			iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
		}
		if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
		    (dp->convtype & KICONV_UCS_TO_LE) == 0) {
			iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
		}
	}

	/* Case-mapping handle, only needed when one side is UTF-8. */
	dp->ctype = NULL;
	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
		iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);

	dp->d_csp = csp;
	/*
	 * iconv_ucs_close() tests d_cspf unconditionally, so give it a defined
	 * value here instead of depending on the allocator zeroing the object.
	 */
	dp->d_cspf = NULL;

	/* Take the references that iconv_ucs_close() will drop again. */
	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
		if (cspf) {
			dp->d_cspf = cspf;
			cspf->cp_refcount++;
		} else
			csp->cp_refcount++;
	}
	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
		csp->cp_refcount++;
	*dpp = (void*)dp;
	return 0;
}

static int
iconv_ucs_close(void *data)
{
	struct iconv_ucs *dp = data;

	if (dp->f_ctp)
		iconv_close(dp->f_ctp);
	if (dp->t_ctp)
		iconv_close(dp->t_ctp);
	if (dp->ctype)
		iconv_close(dp->ctype);
	if (dp->d_cspf)
		dp->d_cspf->cp_refcount--;
	else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
		dp->d_csp->cp_refcount--;
	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
		dp->d_csp->cp_refcount--;
	kobj_delete((struct kobj*)data, M_ICONV);
	return 0;
}

/*
 * Convert characters from *inbuf to *outbuf, one character per iteration.
 *
 * Every character goes through two halves: the source bytes are first
 * turned into 2 or 4 big-endian bytes in ucs[] (one 16-bit unit, or a
 * surrogate pair), then ucs[] is turned into the destination encoding.
 *
 * d2p          - instance created by iconv_ucs_open().
 * inbuf        - in/out: input cursor, advanced by the bytes consumed.
 * inbytesleft  - in/out: input bytes remaining.
 * outbuf       - in/out: output cursor, advanced by the bytes produced.
 * outbytesleft - in/out: output bytes remaining.
 * convchar     - when 1, stop after a single character.
 * casetype     - KICONV_* case-mapping request.
 *
 * Returns 0 when the loop ends normally (input or output exhausted, or one
 * character done with convchar == 1) and also when any buffer pointer is
 * NULL.  Returns -1 on malformed or unrepresentable input or insufficient
 * space, or the non-zero result of iconv_convchr_case().  On failure the
 * cursors reflect only the characters that were converted completely.
 */
static int
iconv_ucs_conv(void *d2p, const char **inbuf,
	size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
	int convchar, int casetype)
{
	struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
	int ret = 0, i;
	size_t in, on, ir, or, inlen, outlen, ucslen;
	const char *src, *p;
	char *dst;
	u_char ucs[4], *q;
	uint32_t code;

	if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
		return 0;
	ir = in = *inbytesleft;
	or = on = *outbytesleft;
	src = *inbuf;
	dst = *outbuf;

	while (ir > 0 && or > 0) {

		/*
		 * The first half of conversion.
		 * (convert any code into ENCODING_UNICODE)
		 */
		code = 0;
		p = src;
		if (dp->convtype & KICONV_UCS_FROM_UTF8) {
			/* convert UTF-8 to ENCODING_UNICODE */
			inlen = 0;
			code = utf8_to_ucs4(p, &inlen, ir);
			if (code == 0) {
				/* decoder failure (a decoded NUL is reported the same way) */
				ret = -1;
				break;
			}

			if (casetype == KICONV_FROM_LOWER && dp->ctype) {
				code = towlower(code, dp->ctype);
			} else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
				code = towupper(code, dp->ctype);
			}

			if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
				/* reserved for utf-16 surrogate pair */
				/* invalid unicode */
				ret = -1;
				break;
			}

			if (inlen == 4) {
				if (dp->convtype & KICONV_UCS_UCS4) {
					ucslen = 4;
					code = encode_surrogate(code);
				} else {
					/* can't handle with ucs-2 */
					ret = -1;
					break;
				}
			} else {
				ucslen = 2;
			}

			/* save UCS-4 into ucs[], most significant byte first */
			for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
				*q++ = (code >> (i << 3)) & 0xff;

		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
			/* convert local code to ENCODING_UNICODE */
			ucslen = 4;
			inlen = ir;
			q = ucs;
			ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
			    &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
			if (ret)
				break;
			/* the helper left "remaining" counts; turn them into "used" */
			inlen = ir - inlen;
			ucslen = 4 - ucslen;

		} else {
			/* src code is a proper subset of ENCODING_UNICODE */
			if (ir < 2) {
				/* truncated 16-bit unit: do not read past the input */
				ret = -1;
				break;
			}
			q = ucs;
			if (dp->convtype & KICONV_UCS_FROM_LE) {
				*q = *(p + 1);
				*(q + 1) = *p;
				p += 2;
			} else {
				*q = *p++;
				*(q + 1) = *p++;
			}
			if ((*q & 0xfc) == 0xd8) {
				/* high surrogate: a second unit must follow */
				if (dp->convtype & KICONV_UCS_UCS4 &&
				    dp->convtype & KICONV_UCS_FROM_UTF16) {
					inlen = ucslen = 4;
				} else {
					/* invalid unicode */
					ret = -1;
					break;
				}
			} else {
				inlen = ucslen = 2;
			}
			if (ir < inlen) {
				ret = -1;
				break;
			}
			if (ucslen == 4) {
				q += 2;
				if (dp->convtype & KICONV_UCS_FROM_LE) {
					*q = *(p + 1);
					*(q + 1) = *p;
				} else {
					*q = *p++;
					*(q + 1) = *p;
				}
				if ((*q & 0xfc) != 0xdc) {
					/* invalid unicode */
					ret = -1;
					break;
				}
			}
		}

		/*
		 * The second half of conversion.
		 * (convert ENCODING_UNICODE into any code)
		 */
		p = (const char *)ucs;
		if (dp->convtype & KICONV_UCS_TO_UTF8) {
			if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
				/* decode surrogate pair */
				code = decode_surrogate(ucs);
			} else {
				code = (ucs[0] << 8) | ucs[1];
			}

			if (casetype == KICONV_LOWER && dp->ctype) {
				code = towlower(code, dp->ctype);
			} else if (casetype == KICONV_UPPER && dp->ctype) {
				code = towupper(code, dp->ctype);
			}

			outlen = 0;
			if (ucs4_to_utf8(code, dst, &outlen, or) == NULL) {
				ret = -1;
				break;
			}

			src += inlen;
			ir -= inlen;
			dst += outlen;
			or -= outlen;

		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
			/* the helper advances dst and decrements or itself */
			ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
			    &or, casetype & (KICONV_LOWER | KICONV_UPPER));
			if (ret)
				break;

			src += inlen;
			ir -= inlen;

		} else {
			/* dst code is a proper subset of ENCODING_UNICODE */
			if (or < ucslen) {
				ret = -1;
				break;
			}
			if (ucslen == 4 &&
			    ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
			    (dp->convtype & KICONV_UCS_TO_UTF16) == 0)) {
				/*
				 * Destination cannot hold a surrogate pair.
				 * Reject before touching any cursor so the
				 * caller never sees a half-written character.
				 */
				ret = -1;
				break;
			}
			src += inlen;
			ir -= inlen;
			or -= ucslen;
			if (dp->convtype & KICONV_UCS_TO_LE) {
				*dst++ = *(p + 1);
				*dst++ = *p;
				p += 2;
			} else {
				*dst++ = *p++;
				*dst++ = *p++;
			}
			if (ucslen == 4) {
				if (dp->convtype & KICONV_UCS_TO_LE) {
					*dst++ = *(p + 1);
					*dst++ = *p;
				} else {
					*dst++ = *p++;
					*dst++ = *p;
				}
			}
		}

		if (convchar == 1)
			break;
	}

	/* publish progress: everything between the saved and current counts */
	*inbuf += in - ir;
	*outbuf += on - or;
	*inbytesleft -= in - ir;
	*outbytesleft -= on - or;
	return (ret);
}

/*
 * Converter-class init method.  Calls iconv_add() for ENCODING_UNICODE and
 * ENCODING_UTF8 in both argument orders (presumably registering the two
 * conversion directions -- confirm against iconv_add()) and returns the
 * first non-zero result, or 0 when both calls succeed.
 */
static int
iconv_ucs_init(struct iconv_converter_class *dcp)
{
	int rc;

	rc = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
	if (rc == 0)
		rc = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);

	return (rc);
}

/*
 * Converter-class done method.  Performs no work and always returns 0.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	(void)dcp;

	return (0);
}

/*
 * Converter-class name method: returns ENCODING_UNICODE.
 */
static const char *
iconv_ucs_name(struct iconv_converter_class *dcp)
{
	const char *name = ENCODING_UNICODE;

	(void)dcp;
	return (name);
}

/*
 * Method table binding the iconv_converter interface entries to the
 * functions defined in this file.  The {0, 0} entry terminates the table.
 */
static kobj_method_t iconv_ucs_methods[] = {
	KOBJMETHOD(iconv_converter_open,	iconv_ucs_open),
	KOBJMETHOD(iconv_converter_close,	iconv_ucs_close),
	KOBJMETHOD(iconv_converter_conv,	iconv_ucs_conv),
	KOBJMETHOD(iconv_converter_init,	iconv_ucs_init),
	KOBJMETHOD(iconv_converter_done,	iconv_ucs_done),
	KOBJMETHOD(iconv_converter_name,	iconv_ucs_name),
	{0, 0}
};

/*
 * NOTE(review): presumably declares the "ucs" converter class with
 * struct iconv_ucs as its instance size -- confirm against the
 * KICONV_CONVERTER definition in <sys/iconv.h>.
 */
KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));

/*
 * Decode a single UTF-8 sequence of 1 to 4 bytes.
 *
 * src       - first byte of the sequence.
 * utf8width - receives the number of bytes consumed; written only on
 *             success.
 * srclen    - number of bytes available at src.
 *
 * Returns the decoded code point, or 0 on failure.  Failure means: empty
 * input, an illegal lead byte, a sequence longer than srclen, a trailing
 * byte that is not of the form 10xxxxxx, or an overlong encoding (a value
 * stored in more bytes than it needs).  Because 0 is the failure value, an
 * encoded NUL cannot be told apart from an error.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	size_t i, w = 0;
	uint32_t ucs4 = 0;
	uint32_t min = 0;	/* smallest value that legitimately needs w bytes */

	if (srclen == 0)
		return (0);

	/*
	 * get leading 1 byte from utf-8
	 */
	if ((*src & 0x80) == 0) {
		/*
		 * leading 1 bit is "0"
		 *  utf-8: 0xxxxxxx
		 *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
		 */
		w = 1;
		min = 0;
		/* get trailing 7 bits */
		ucs4 = *src & 0x7f;
	} else if ((*src & 0xe0) == 0xc0) {
		/*
		 * leading 3 bits are "110"
		 *  utf-8: 110xxxxx 10yyyyyy
		 *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
		 */
		w = 2;
		min = 0x80;
		/* get trailing 5 bits */
		ucs4 = *src & 0x1f;
	} else if ((*src & 0xf0) == 0xe0) {
		/*
		 * leading 4 bits are "1110"
		 *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
		 */
		w = 3;
		min = 0x800;
		/* get trailing 4 bits */
		ucs4 = *src & 0x0f;
	} else if ((*src & 0xf8) == 0xf0) {
		/*
		 * leading 5 bits are "11110"
		 *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
		 */
		w = 4;
		min = 0x10000;
		/* get trailing 3 bits */
		ucs4 = *src & 0x07;
	} else {
		/* out of utf-16 range or having illegal bits */
		return (0);
	}

	if (srclen < w)
		return (0);

	/*
	 * get the remaining bytes of the sequence
	 */
	for (i = 1 ; i < w ; i++) {
		if ((*(src + i) & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		/* concatenate trailing 6 bits into ucs4 */
		ucs4 <<= 6;
		ucs4 |= *(src + i) & 0x3f;
	}

	/*
	 * Reject overlong forms.  Besides being invalid UTF-8, a 4-byte
	 * sequence holding a value below 0x10000 would later be handed to
	 * encode_surrogate(), which assumes a supplementary-plane code point.
	 */
	if (ucs4 < min)
		return (0);

	*utf8width = w;
	return (ucs4);
}

/*
 * Encode one code point as UTF-8 into dst.
 *
 * ucs4      - value to encode; must be below 0x200000.
 * dst       - output buffer.
 * utf8width - receives the number of bytes written; written only on
 *             success.
 * dstlen    - space available at dst.
 *
 * Returns dst (as u_char *) on success, or NULL when the value is too
 * large or the encoded form does not fit in dstlen bytes.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char *out, prefix;
	size_t width, pos;

	/* pick the sequence length and the marker bits of the first byte */
	if (ucs4 < 0x80) {
		width = 1;
		prefix = 0x00;	/* 0xxxxxxx */
	} else if (ucs4 < 0x800) {
		width = 2;
		prefix = 0xc0;	/* 110xxxxx */
	} else if (ucs4 < 0x10000) {
		width = 3;
		prefix = 0xe0;	/* 1110xxxx */
	} else if (ucs4 < 0x200000) {
		width = 4;
		prefix = 0xf0;	/* 11110xxx */
	} else {
		return (NULL);
	}

	if (width > dstlen)
		return (NULL);

	out = (u_char *)dst;

	/* fill continuation bytes from the last one back to index 1 */
	pos = width;
	while (--pos > 0) {
		out[pos] = 0x80 | (ucs4 & 0x3f);
		ucs4 >>= 6;
	}
	/* whatever bits remain belong in the first byte */
	out[0] = prefix | ucs4;

	*utf8width = width;

	return (out);
}

/*
 * Pack a supplementary-plane code point (0x10000 and above) into a UTF-16
 * surrogate pair, returned as one 32-bit value: high surrogate in the upper
 * 16 bits, low surrogate in the lower 16 bits.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	uint32_t offset = code - 0x10000;
	uint32_t high = (offset << 6) & 0x3ff0000;	/* top 10 bits -> bits 16..25 */
	uint32_t low = offset & 0x3ff;			/* bottom 10 bits */

	return (0xd800dc00 | high | low);
}

/*
 * Rebuild a code point from a surrogate pair stored as four big-endian
 * bytes (high surrogate in ucs[0..1], low surrogate in ucs[2..3]).  Only
 * the low 10 bits of each 16-bit unit are used; the surrogate marker bits
 * are not checked here.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	uint32_t hi10, lo10;

	hi10 = ((ucs[0] & 0x3) << 8) | ucs[1];
	lo10 = ((ucs[2] & 0x3) << 8) | ucs[3];

	return (((hi10 << 10) | lo10) + 0x10000);
}
Add unicode support to msdosfs and smbfs; original pathes from imura, bug fixes by Kuan-Chung Chiu <buganini at gmail dot com>. Tested by me in production for several days at work. 2011-11-18 03:05:20 +00:00			`/*-`
sys/kern: adoption of SPDX licensing ID tags. Mainly focus on files that use BSD 2-Clause license, however the tool I was using misidentified many licenses so this was mostly a manual - error prone - task. The Software Package Data Exchange (SPDX) group provides a specification to make it easier for automated tools to detect and summarize well known opensource licenses. We are gradually adopting the specification, noting that the tags are considered only advisory and do not, in any way, superceed or replace the license texts. 2017-11-27 15:20:12 +00:00			`* SPDX-License-Identifier: BSD-2-Clause-FreeBSD`
			`*`
Add unicode support to msdosfs and smbfs; original pathes from imura, bug fixes by Kuan-Chung Chiu <buganini at gmail dot com>. Tested by me in production for several days at work. 2011-11-18 03:05:20 +00:00			`* Copyright (c) 2003, 2005 Ryuichiro Imura`
			`* All rights reserved.`
			`*`
			`* Redistribution and use in source and binary forms, with or without`
			`* modification, are permitted provided that the following conditions`
			`* are met:`
			`* 1. Redistributions of source code must retain the above copyright`
			`* notice, this list of conditions and the following disclaimer.`
			`* 2. Redistributions in binary form must reproduce the above copyright`
			`* notice, this list of conditions and the following disclaimer in the`
			`* documentation and/or other materials provided with the distribution.`
			`*`
			* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
			`* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE`
			`* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE`
			`* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE`
			`* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL`
			`* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS`
			`* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)`
			`* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT`
			`* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY`
			`* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF`
			`* SUCH DAMAGE.`
			`*/`

			`#include <sys/cdefs.h>`
			`__FBSDID("$FreeBSD$");`

			`#include <sys/param.h>`
			`#include <sys/kernel.h>`
			`#include <sys/systm.h>`
			`#include <sys/malloc.h>`
			`#include <sys/iconv.h>`

			`#include "iconv_converter_if.h"`

			`/*`
			`* "UCS" converter`
			`*/`

			`#define KICONV_UCS_COMBINE 0x1`
			`#define KICONV_UCS_FROM_UTF8 0x2`
			`#define KICONV_UCS_TO_UTF8 0x4`
			`#define KICONV_UCS_FROM_LE 0x8`
			`#define KICONV_UCS_TO_LE 0x10`
			`#define KICONV_UCS_FROM_UTF16 0x20`
			`#define KICONV_UCS_TO_UTF16 0x40`
			`#define KICONV_UCS_UCS4 0x80`

			`#define ENCODING_UTF16 "UTF-16BE"`
			`#define ENCODING_UTF8 "UTF-8"`

			`static struct {`
			`const char *name;`
			`int from_flag, to_flag;`
			`} unicode_family[] = {`
			`{ "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 },`
			`{ "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE },`
			`{ "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 },`
			`{ "UTF-16LE", KICONV_UCS_FROM_UTF16\|KICONV_UCS_FROM_LE,`
			`KICONV_UCS_TO_UTF16\|KICONV_UCS_TO_LE },`
			`{ NULL, 0, 0 }`
			`};`

			`static uint32_t utf8_to_ucs4(const char src, size_t utf8width, size_t srclen);`
			`static u_char ucs4_to_utf8(uint32_t ucs4, char dst, size_t *utf8width, size_t dstlen);`
			`static uint32_t encode_surrogate(uint32_t code);`
			`static uint32_t decode_surrogate(const u_char *ucs);`

			`#ifdef MODULE_DEPEND`
			`MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);`
			`#endif`

			`/*`
			`* UCS converter instance`
			`*/`
			`struct iconv_ucs {`
			`KOBJ_FIELDS;`
			`int convtype;`
			`struct iconv_cspair * d_csp;`
			`struct iconv_cspair * d_cspf;`
			`void * f_ctp;`
			`void * t_ctp;`
			`void * ctype;`
			`};`

			`static int`
			`iconv_ucs_open(struct iconv_converter_class *dcp,`
			`struct iconv_cspair csp, struct iconv_cspair cspf, void **dpp)`
			`{`
			`struct iconv_ucs *dp;`
			`int i;`
			`const char from, to;`

			`dp = (struct iconv_ucs )kobj_create((struct kobj_class)dcp, M_ICONV, M_WAITOK);`
			`to = csp->cp_to;`
			`from = cspf ? cspf->cp_from : csp->cp_from;`

			`dp->convtype = 0;`

			`if (cspf)`
			`dp->convtype \|= KICONV_UCS_COMBINE;`
			`for (i = 0; unicode_family[i].name; i++) {`
Use strcasecmp() instead of strcmp() when checking user-supplied encoding names so that encoding names are treated as case-insensitive. This allows the use of 'utf-8' instead of 'UTF-8' for example and matches the behavior of iconv(1). PR: 167977 Submitted by: buganini@gmail.com MFC after: 1 week 2014-06-09 19:27:47 +00:00			`if (strcasecmp(from, unicode_family[i].name) == 0)`
Add unicode support to msdosfs and smbfs; original pathes from imura, bug fixes by Kuan-Chung Chiu <buganini at gmail dot com>. Tested by me in production for several days at work. 2011-11-18 03:05:20 +00:00			`dp->convtype \|= unicode_family[i].from_flag;`
Use strcasecmp() instead of strcmp() when checking user-supplied encoding names so that encoding names are treated as case-insensitive. This allows the use of 'utf-8' instead of 'UTF-8' for example and matches the behavior of iconv(1). PR: 167977 Submitted by: buganini@gmail.com MFC after: 1 week 2014-06-09 19:27:47 +00:00			`if (strcasecmp(to, unicode_family[i].name) == 0)`
Add unicode support to msdosfs and smbfs; original pathes from imura, bug fixes by Kuan-Chung Chiu <buganini at gmail dot com>. Tested by me in production for several days at work. 2011-11-18 03:05:20 +00:00			`dp->convtype \|= unicode_family[i].to_flag;`
			`}`
Use strcmp that I replaced by accident. 2012-05-21 02:45:47 +00:00			`if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)`
Add unicode support to msdosfs and smbfs; original pathes from imura, bug fixes by Kuan-Chung Chiu <buganini at gmail dot com>. Tested by me in production for several days at work. 2011-11-18 03:05:20 +00:00			`dp->convtype \|= KICONV_UCS_UCS4;`
			`else`
			`dp->convtype &= ~KICONV_UCS_UCS4;`

			`dp->f_ctp = dp->t_ctp = NULL;`
			`if (dp->convtype & KICONV_UCS_COMBINE) {`
			`if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&`
			`(dp->convtype & KICONV_UCS_FROM_LE) == 0) {`
			`iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);`
			`}`
			`if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&`
			`(dp->convtype & KICONV_UCS_TO_LE) == 0) {`
			`iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);`
			`}`
			`}`

			`dp->ctype = NULL;`
			`if (dp->convtype & (KICONV_UCS_FROM_UTF8 \| KICONV_UCS_TO_UTF8))`
			`iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);`

			`dp->d_csp = csp;`
			`if (dp->convtype & (KICONV_UCS_FROM_UTF8 \| KICONV_UCS_FROM_LE)) {`
			`if (cspf) {`
			`dp->d_cspf = cspf;`
			`cspf->cp_refcount++;`
			`} else`
			`csp->cp_refcount++;`
			`}`
			`if (dp->convtype & (KICONV_UCS_TO_UTF8 \| KICONV_UCS_TO_LE))`
			`csp->cp_refcount++;`
			`dpp = (void)dp;`
			`return 0;`
			`}`

			`static int`
			`iconv_ucs_close(void *data)`
			`{`
			`struct iconv_ucs *dp = data;`

			`if (dp->f_ctp)`
			`iconv_close(dp->f_ctp);`
			`if (dp->t_ctp)`
			`iconv_close(dp->t_ctp);`
			`if (dp->ctype)`
			`iconv_close(dp->ctype);`
			`if (dp->d_cspf)`
			`dp->d_cspf->cp_refcount--;`
			`else if (dp->convtype & (KICONV_UCS_FROM_UTF8 \| KICONV_UCS_FROM_LE))`
			`dp->d_csp->cp_refcount--;`
			`if (dp->convtype & (KICONV_UCS_TO_UTF8 \| KICONV_UCS_TO_LE))`
			`dp->d_csp->cp_refcount--;`
			`kobj_delete((struct kobj*)data, M_ICONV);`
			`return 0;`
			`}`

			`static int`
			`iconv_ucs_conv(void d2p, const char *inbuf,`
			`size_t inbytesleft, char outbuf, size_t outbytesleft,`
			`int convchar, int casetype)`
			`{`
			`struct iconv_ucs dp = (struct iconv_ucs)d2p;`
			`int ret = 0, i;`
			`size_t in, on, ir, or, inlen, outlen, ucslen;`
			`const char src, p;`
			`char *dst;`
			`u_char ucs[4], *q;`
			`uint32_t code;`

			`if (inbuf == NULL \|\| inbuf == NULL \|\| outbuf == NULL \|\| outbuf == NULL)`
			`return 0;`
			`ir = in = *inbytesleft;`
			`or = on = *outbytesleft;`
			`src = *inbuf;`
			`dst = *outbuf;`

			`while (ir > 0 && or > 0) {`

			`/*`
			`* The first half of conversion.`
			`* (convert any code into ENCODING_UNICODE)`
			`*/`
			`code = 0;`
			`p = src;`
			`if (dp->convtype & KICONV_UCS_FROM_UTF8) {`
			`/* convert UTF-8 to ENCODING_UNICODE */`
			`inlen = 0;`
			`code = utf8_to_ucs4(p, &inlen, ir);`
			`if (code == 0) {`
			`ret = -1;`
			`break;`
			`}`

			`if (casetype == KICONV_FROM_LOWER && dp->ctype) {`
			`code = towlower(code, dp->ctype);`
			`} else if (casetype == KICONV_FROM_UPPER && dp->ctype) {`
			`code = towupper(code, dp->ctype);`
			`}`

			`if ((code >= 0xd800 && code < 0xe000) \|\| code >= 0x110000 ) {`
			`/* reserved for utf-16 surrogate pair */`
			`/* invalid unicode */`
			`ret = -1;`
			`break;`
			`}`

			`if (inlen == 4) {`
			`if (dp->convtype & KICONV_UCS_UCS4) {`
			`ucslen = 4;`
			`code = encode_surrogate(code);`
			`} else {`
			`/* can't handle with ucs-2 */`
			`ret = -1;`
			`break;`
			`}`
			`} else {`
			`ucslen = 2;`
			`}`

			`/* save UCS-4 into ucs[] */`
			`for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)`
			`*q++ = (code >> (i << 3)) & 0xff;`

			`} else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {`
			`/* convert local code to ENCODING_UNICODE */`
			`ucslen = 4;`
			`inlen = ir;`
			`q = ucs;`
			`ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,`
			`&ucslen, casetype & (KICONV_FROM_LOWER \| KICONV_FROM_UPPER));`
			`if (ret)`
			`break;`
			`inlen = ir - inlen;`
			`ucslen = 4 - ucslen;`

			`} else {`
			`/* src code is a proper subset of ENCODING_UNICODE */`
			`q = ucs;`
			`if (dp->convtype & KICONV_UCS_FROM_LE) {`
			`q = (p + 1);`
			`(q + 1) = p;`
			`p += 2;`
			`} else {`
			`q = p++;`
			`(q + 1) = p++;`
			`}`
			`if ((*q & 0xfc) == 0xd8) {`
			`if (dp->convtype & KICONV_UCS_UCS4 &&`
			`dp->convtype & KICONV_UCS_FROM_UTF16) {`
			`inlen = ucslen = 4;`
			`} else {`
			`/* invalid unicode */`
			`ret = -1;`
			`break;`
			`}`
			`} else {`
			`inlen = ucslen = 2;`
			`}`
			`if (ir < inlen) {`
			`ret = -1;`
			`break;`
			`}`
			`if (ucslen == 4) {`
			`q += 2;`
			`if (dp->convtype & KICONV_UCS_FROM_LE) {`
			`q = (p + 1);`
			`(q + 1) = p;`
			`} else {`
			`q = p++;`
			`(q + 1) = p;`
			`}`
			`if ((*q & 0xfc) != 0xdc) {`
			`/* invalid unicode */`
			`ret = -1;`
			`break;`
			`}`
			`}`
			`}`

			`/*`
			`* The second half of conversion.`
			`* (convert ENCODING_UNICODE into any code)`
			`*/`
			`p = ucs;`
			`if (dp->convtype & KICONV_UCS_TO_UTF8) {`
			`q = (u_char *)dst;`
			`if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {`
			`/* decode surrogate pair */`
			`code = decode_surrogate(p);`
			`} else {`
			`code = (ucs[0] << 8) \| ucs[1];`
			`}`

			`if (casetype == KICONV_LOWER && dp->ctype) {`
			`code = towlower(code, dp->ctype);`
			`} else if (casetype == KICONV_UPPER && dp->ctype) {`
			`code = towupper(code, dp->ctype);`
			`}`

			`outlen = 0;`
			`if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {`
			`ret = -1;`
			`break;`
			`}`

			`src += inlen;`
			`ir -= inlen;`
			`dst += outlen;`
			`or -= outlen;`

			`} else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {`
			`ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,`
			`&or, casetype & (KICONV_LOWER \| KICONV_UPPER));`
			`if (ret)`
			`break;`

			`src += inlen;`
			`ir -= inlen;`

			`} else {`
			`/* dst code is a proper subset of ENCODING_UNICODE */`
			`if (or < ucslen) {`
			`ret = -1;`
			`break;`
			`}`
			`src += inlen;`
			`ir -= inlen;`
			`or -= ucslen;`
			`if (dp->convtype & KICONV_UCS_TO_LE) {`
			`dst++ = (p + 1);`
			`dst++ = p;`
			`p += 2;`
			`} else {`
			`dst++ = p++;`
			`dst++ = p++;`
			`}`
			`if (ucslen == 4) {`
			`if ((dp->convtype & KICONV_UCS_UCS4) == 0 \|\|`
			`(dp->convtype & KICONV_UCS_TO_UTF16) == 0) {`
			`ret = -1;`
			`break;`
			`}`
			`if (dp->convtype & KICONV_UCS_TO_LE) {`
			`dst++ = (p + 1);`
			`dst++ = p;`
			`} else {`
			`dst++ = p++;`
			`dst++ = p;`
			`}`
			`}`
			`}`

			`if (convchar == 1)`
			`break;`
			`}`

			`*inbuf += in - ir;`
			`*outbuf += on - or;`
			`*inbytesleft -= in - ir;`
			`*outbytesleft -= on - or;`
			`return (ret);`
			`}`

			`static int`
			`iconv_ucs_init(struct iconv_converter_class *dcp)`
			`{`
			`int error;`

			`error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);`
			`if (error)`
			`return (error);`
			`error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);`
			`if (error)`
			`return (error);`
			`return (0);`
			`}`

			`static int`
			`iconv_ucs_done(struct iconv_converter_class *dcp)`
			`{`
			`return (0);`
			`}`

			`static const char *`
			`iconv_ucs_name(struct iconv_converter_class *dcp)`
			`{`
			`return (ENCODING_UNICODE);`
			`}`

			`static kobj_method_t iconv_ucs_methods[] = {`
			`KOBJMETHOD(iconv_converter_open, iconv_ucs_open),`
			`KOBJMETHOD(iconv_converter_close, iconv_ucs_close),`
			`KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv),`
			`KOBJMETHOD(iconv_converter_init, iconv_ucs_init),`
			`KOBJMETHOD(iconv_converter_done, iconv_ucs_done),`
			`KOBJMETHOD(iconv_converter_name, iconv_ucs_name),`
			`{0, 0}`
			`};`

			`KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));`

			`static uint32_t`
			`utf8_to_ucs4(const char src, size_t utf8width, size_t srclen)`
			`{`
			`size_t i, w = 0;`
			`uint32_t ucs4 = 0;`

			`/*`
			`* get leading 1 byte from utf-8`
			`*/`
			`if ((*src & 0x80) == 0) {`
			`/*`
			`* leading 1 bit is "0"`
			`* utf-8: 0xxxxxxx`
			`* ucs-4: 00000000 00000000 00000000 0xxxxxxx`
			`*/`
			`w = 1;`
			`/* get trailing 7 bits */`
			`ucs4 = *src & 0x7f;`
			`} else if ((*src & 0xe0) == 0xc0) {`
			`/*`
			`* leading 3 bits are "110"`
			`* utf-8: 110xxxxx 10yyyyyy`
			`* ucs-4: 00000000 00000000 00000xxx xxyyyyyy`
			`*/`
			`w = 2;`
			`/* get trailing 5 bits */`
			`ucs4 = *src & 0x1f;`
			`} else if ((*src & 0xf0) == 0xe0) {`
			`/*`
			`* leading 4 bits are "1110"`
			`* utf-8: 1110xxxx 10yyyyyy 10zzzzzz`
			`* ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz`
			`*/`
			`w = 3;`
			`/* get trailing 4 bits */`
			`ucs4 = *src & 0x0f;`
			`} else if ((*src & 0xf8) == 0xf0) {`
			`/*`
			`* leading 5 bits are "11110"`
			`* utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz`
			`* ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz`
			`*/`
			`w = 4;`
			`/* get trailing 3 bits */`
			`ucs4 = *src & 0x07;`
			`} else {`
			`/* out of utf-16 range or having illegal bits */`
			`return (0);`
			`}`

			`if (srclen < w)`
			`return (0);`

			`/*`
			`* get left parts from utf-8`
			`*/`
			`for (i = 1 ; i < w ; i++) {`
			`if ((*(src + i) & 0xc0) != 0x80) {`
			`/* invalid: leading 2 bits are not "10" */`
			`return (0);`
			`}`
			`/* concatenate trailing 6 bits into ucs4 */`
			`ucs4 <<= 6;`
			`ucs4 \|= *(src + i) & 0x3f;`
			`}`

			`*utf8width = w;`
			`return (ucs4);`
			`}`

			`static u_char *`
			`ucs4_to_utf8(uint32_t ucs4, char dst, size_t utf8width, size_t dstlen)`
			`{`
			`u_char lead, *p;`
			`size_t i, w;`

			`/*`
			`* determine utf-8 width and leading bits`
			`*/`
			`if (ucs4 < 0x80) {`
			`w = 1;`
			`lead = 0; /* "0" */`
			`} else if (ucs4 < 0x800) {`
			`w = 2;`
			`lead = 0xc0; /* "11" */`
			`} else if (ucs4 < 0x10000) {`
			`w = 3;`
			`lead = 0xe0; /* "111" */`
			`} else if (ucs4 < 0x200000) {`
			`w = 4;`
			`lead = 0xf0; /* "1111" */`
			`} else {`
			`return (NULL);`
			`}`

			`if (dstlen < w)`
			`return (NULL);`

			`/*`
			`* construct utf-8`
			`*/`
			`p = dst;`
			`for (i = w - 1 ; i >= 1 ; i--) {`
			`/* get trailing 6 bits and put it with leading bit as "1" */`
			`*(p + i) = (ucs4 & 0x3f) \| 0x80;`
			`ucs4 >>= 6;`
			`}`
			`*p = ucs4 \| lead;`

			`*utf8width = w;`

			`return (p);`
			`}`

			`static uint32_t`
libkern: Remove obsolete 'register' keyword Sponsored by: Dell EMC Isilon 2017-01-12 17:02:29 +00:00			`encode_surrogate(uint32_t code)`
Add unicode support to msdosfs and smbfs; original pathes from imura, bug fixes by Kuan-Chung Chiu <buganini at gmail dot com>. Tested by me in production for several days at work. 2011-11-18 03:05:20 +00:00			`{`
			`return ((((code - 0x10000) << 6) & 0x3ff0000) \|`
			`((code - 0x10000) & 0x3ff) \| 0xd800dc00);`
			`}`

			`static uint32_t`
libkern: Remove obsolete 'register' keyword Sponsored by: Dell EMC Isilon 2017-01-12 17:02:29 +00:00			`decode_surrogate(const u_char *ucs)`
Add unicode support to msdosfs and smbfs; original pathes from imura, bug fixes by Kuan-Chung Chiu <buganini at gmail dot com>. Tested by me in production for several days at work. 2011-11-18 03:05:20 +00:00			`{`
			`return ((((ucs[0] & 0x3) << 18) \| (ucs[1] << 10) \|`
			`((ucs[2] & 0x3) << 8) \| ucs[3]) + 0x10000);`
			`}`