freebsd-dev/contrib/nvi/common/encoding.c

/*-
 * Copyright (c) 2011, 2012
 *	Zhihao Yuan.  All rights reserved.
 *
 * See the LICENSE file for redistribution information.
 */

#ifndef lint
static const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $";
#endif /* not lint */

#include <sys/types.h>

int looks_utf8 __P((const char *, size_t));
int looks_utf16 __P((const char *, size_t));
int decode_utf8 __P((const char *));
int decode_utf16 __P((const char *, int));

#define F 0   /* character never appears in text */
#define T 1   /* character appears in plain ASCII text */
#define I 2   /* character appears in ISO-8859 text */
#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */

static char text_chars[256] = {
	/*                  BEL BS HT LF    FF CR    */
	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
	/*                              ESC          */
	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
	/*            NEL                            */
	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
};

/*
 * looks_utf8 --
 *  Decide whether some text looks like UTF-8. Returns:
 *
 *     -1: invalid UTF-8
 *      0: uses odd control characters, so doesn't look like text
 *      1: 7-bit text
 *      2: definitely UTF-8 text (valid high-bit set bytes)
 *
 *  Based on RFC 3629. UTF-8 with BOM is not accepted.
 *
 * PUBLIC: int looks_utf8 __P((const char *, size_t));
 */
int
looks_utf8(const char *ibuf, size_t nbytes)
{
	const u_char *buf = (u_char *)ibuf;
	size_t i;
	int n;
	int gotone = 0, ctrl = 0;

	for (i = 0; i < nbytes; i++) {
		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
			/*
			 * Even if the whole file is valid UTF-8 sequences,
			 * still reject it if it uses weird control characters.
			 */

			if (text_chars[buf[i]] != T)
				ctrl = 1;
		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
			return -1;
		} else {			   /* 11xxxxxx begins UTF-8 */
			int following;

			if ((buf[i] & 0x20) == 0)	/* 110xxxxx */
				if (buf[i] > 0xC1)	/* C0, C1 */
					following = 1;
				else return -1;
			else if ((buf[i] & 0x10) == 0)	/* 1110xxxx */
				following = 2;
			else if ((buf[i] & 0x08) == 0)	/* 11110xxx */
				if (buf[i] < 0xF5)
					following = 3;
				else return -1;		/* F5, F6, F7 */
			else
				return -1;		/* F8~FF */

			for (n = 0; n < following; n++) {
				i++;
				if (i >= nbytes)
					goto done;

				if (buf[i] & 0x40)	/* 10xxxxxx */
					return -1;
			}

			gotone = 1;
		}
	}
done:
	return ctrl ? 0 : (gotone ? 2 : 1);
}

/*
 * looks_utf16 --
 *  Decide whether some text looks like UTF-16. Returns:
 *
 *      0: invalid UTF-16
 *      1: Little-endian UTF-16
 *      2: Big-endian UTF-16
 *
 * PUBLIC: int looks_utf16 __P((const char *, size_t));
 */
int
looks_utf16(const char *ibuf, size_t nbytes)
{
	const u_char *buf = (u_char *)ibuf;
	int bigend;
	size_t i;
	unsigned int c;
	int bom;
	int following = 0;

	if (nbytes < 2)
		return 0;

	bom = buf[0] << 8 ^ buf[1];
	if (bom == 0xFFFE)
		bigend = 0;
	else if (bom == 0xFEFF)
		bigend = 1;
	else
		return 0;

	for (i = 2; i + 1 < nbytes; i += 2) {
		if (bigend)
			c = buf[i] << 8 ^ buf[i + 1];
		else
			c = buf[i] ^ buf[i + 1] << 8;

		if (!following)
			if (c < 0xD800 || c > 0xDFFF)
				if (c < 128 && text_chars[c] != T)
					return 0;
				else
					following = 0;
			else if (c > 0xDBFF)
				return 0;
			else {
				following = 1;
				continue;
			}
		else if (c < 0xDC00 || c > 0xDFFF)
			return 0;
	}

	return 1 + bigend;
}

#undef F
#undef T
#undef I
#undef X

/*
 * decode_utf8 --
 *  Decode a UTF-8 character from byte string to Unicode.
 *  Returns -1 if the first byte is a not UTF-8 leader.
 *
 *  Based on RFC 3629, but without error detection.
 *
 * PUBLIC: int decode_utf8 __P((const char *));
 */
int decode_utf8(const char *ibuf) {
	const u_char *buf = (u_char *)ibuf;
	int u = -1;

	if ((buf[0] & 0x80) == 0)
		u = buf[0];
	else if ((buf[0] & 0x40) == 0);
	else {
		if ((buf[0] & 0x20) == 0)
			u = (buf[0] ^ 0xC0) <<  6 ^ (buf[1] ^ 0x80);
		else if ((buf[0] & 0x10) == 0)
			u = (buf[0] ^ 0xE0) << 12 ^ (buf[1] ^ 0x80) <<  6
			  ^ (buf[2] ^ 0x80);
		else if (((buf[0] & 0x08) == 0))
			u = (buf[0] ^ 0xF0) << 18 ^ (buf[1] ^ 0x80) << 12
			  ^ (buf[2] ^ 0x80) <<  6 ^ (buf[3] ^ 0x80);
	}
	return u;
}

/*
 * decode_utf16 --
 *  Decode a UTF-16 character from byte string to Unicode.
 *  Returns -1 if the first unsigned integer is invalid.
 *
 *  No error detection on supplementary bytes.
 *
 * PUBLIC: int decode_utf16 __P((const char *, int));
 */
int decode_utf16(const char* ibuf, int bigend) {
	const u_char *buf = (u_char *)ibuf;
	int u = -1;
	unsigned int w1, w2;

	if (bigend)
		w1 = buf[0] << 8 ^ buf[1];
	else
		w1 = buf[0] ^ buf[1] << 8;

	if (w1 < 0xD800 || w1 > 0xDFFF)
		u = w1;
	else if (w1 > 0xDBFF);
	else {
		if (bigend)
			w2 = buf[2] << 8 ^ buf[3];
		else
			w2 = buf[2] ^ buf[3] << 8;
		u = ((w1 ^ 0xD800) << 10 ^ (w2 ^ 0xDC00)) + 0x10000;
	}
	return u;
}
Import nvi-2.1.1-4334a8297f into the work area. This is the gsoc-2011 project to clean up and backport multibyte support from other nvi forks in a form we can use. GSoC info: http://www.google-melange.com/gsoc/proposal/review/google/gsoc2011/zy/1 Repo at: https://github.com/lichray/nvi2 Obtained from: Zhihao Yuan <lichray@gmail.com> 2013-08-11 09:44:58 +00:00			`/*-`
			`* Copyright (c) 2011, 2012`
			`* Zhihao Yuan. All rights reserved.`
			`*`
			`* See the LICENSE file for redistribution information.`
			`*/`

			`#ifndef lint`
			`static const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $";`
			`#endif /* not lint */`

			`#include <sys/types.h>`

			`int looks_utf8 __P((const char *, size_t));`
			`int looks_utf16 __P((const char *, size_t));`
			`int decode_utf8 __P((const char *));`
			`int decode_utf16 __P((const char *, int));`

			`#define F 0 /* character never appears in text */`
			`#define T 1 /* character appears in plain ASCII text */`
			`#define I 2 /* character appears in ISO-8859 text */`
			`#define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */`

			`static char text_chars[256] = {`
			`/* BEL BS HT LF FF CR */`
			`F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */`
			`/* ESC */`
			`F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */`
			`T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */`
			`T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */`
			`T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */`
			`T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */`
			`T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */`
			`T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */`
			`/* NEL */`
			`X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */`
			`X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */`
			`I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */`
			`I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */`
			`I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */`
			`I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */`
			`I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */`
			`I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */`
			`};`

			`/*`
			`* looks_utf8 --`
			`* Decide whether some text looks like UTF-8. Returns:`
			`*`
			`* -1: invalid UTF-8`
			`* 0: uses odd control characters, so doesn't look like text`
			`* 1: 7-bit text`
			`* 2: definitely UTF-8 text (valid high-bit set bytes)`
			`*`
			`* Based on RFC 3629. UTF-8 with BOM is not accepted.`
			`*`
			`* PUBLIC: int looks_utf8 __P((const char *, size_t));`
			`*/`
			`int`
			`looks_utf8(const char *ibuf, size_t nbytes)`
			`{`
			`const u_char buf = (u_char )ibuf;`
			`size_t i;`
			`int n;`
			`int gotone = 0, ctrl = 0;`

			`for (i = 0; i < nbytes; i++) {`
			`if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */`
			`/*`
			`* Even if the whole file is valid UTF-8 sequences,`
			`* still reject it if it uses weird control characters.`
			`*/`

			`if (text_chars[buf[i]] != T)`
			`ctrl = 1;`
			`} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */`
			`return -1;`
			`} else { /* 11xxxxxx begins UTF-8 */`
			`int following;`

			`if ((buf[i] & 0x20) == 0) /* 110xxxxx */`
			`if (buf[i] > 0xC1) /* C0, C1 */`
			`following = 1;`
			`else return -1;`
			`else if ((buf[i] & 0x10) == 0) /* 1110xxxx */`
			`following = 2;`
			`else if ((buf[i] & 0x08) == 0) /* 11110xxx */`
			`if (buf[i] < 0xF5)`
			`following = 3;`
			`else return -1; /* F5, F6, F7 */`
			`else`
			`return -1; /* F8~FF */`

			`for (n = 0; n < following; n++) {`
			`i++;`
			`if (i >= nbytes)`
			`goto done;`

			`if (buf[i] & 0x40) /* 10xxxxxx */`
			`return -1;`
			`}`

			`gotone = 1;`
			`}`
			`}`
			`done:`
			`return ctrl ? 0 : (gotone ? 2 : 1);`
			`}`

			`/*`
			`* looks_utf16 --`
			`* Decide whether some text looks like UTF-16. Returns:`
			`*`
			`* 0: invalid UTF-16`
			`* 1: Little-endian UTF-16`
			`* 2: Big-endian UTF-16`
			`*`
			`* PUBLIC: int looks_utf16 __P((const char *, size_t));`
			`*/`
			`int`
			`looks_utf16(const char *ibuf, size_t nbytes)`
			`{`
			`const u_char buf = (u_char )ibuf;`
			`int bigend;`
			`size_t i;`
			`unsigned int c;`
			`int bom;`
			`int following = 0;`

			`if (nbytes < 2)`
			`return 0;`

			`bom = buf[0] << 8 ^ buf[1];`
			`if (bom == 0xFFFE)`
			`bigend = 0;`
			`else if (bom == 0xFEFF)`
			`bigend = 1;`
			`else`
			`return 0;`

			`for (i = 2; i + 1 < nbytes; i += 2) {`
			`if (bigend)`
			`c = buf[i] << 8 ^ buf[i + 1];`
			`else`
			`c = buf[i] ^ buf[i + 1] << 8;`

			`if (!following)`
			`if (c < 0xD800 \|\| c > 0xDFFF)`
			`if (c < 128 && text_chars[c] != T)`
			`return 0;`
			`else`
			`following = 0;`
			`else if (c > 0xDBFF)`
			`return 0;`
			`else {`
			`following = 1;`
			`continue;`
			`}`
			`else if (c < 0xDC00 \|\| c > 0xDFFF)`
			`return 0;`
			`}`

			`return 1 + bigend;`
			`}`

			`#undef F`
			`#undef T`
			`#undef I`
			`#undef X`

			`/*`
			`* decode_utf8 --`
			`* Decode a UTF-8 character from byte string to Unicode.`
			`* Returns -1 if the first byte is a not UTF-8 leader.`
			`*`
			`* Based on RFC 3629, but without error detection.`
			`*`
			`* PUBLIC: int decode_utf8 __P((const char *));`
			`*/`
			`int decode_utf8(const char *ibuf) {`
			`const u_char buf = (u_char )ibuf;`
			`int u = -1;`

			`if ((buf[0] & 0x80) == 0)`
			`u = buf[0];`
			`else if ((buf[0] & 0x40) == 0);`
			`else {`
			`if ((buf[0] & 0x20) == 0)`
			`u = (buf[0] ^ 0xC0) << 6 ^ (buf[1] ^ 0x80);`
			`else if ((buf[0] & 0x10) == 0)`
			`u = (buf[0] ^ 0xE0) << 12 ^ (buf[1] ^ 0x80) << 6`
			`^ (buf[2] ^ 0x80);`
			`else if (((buf[0] & 0x08) == 0))`
			`u = (buf[0] ^ 0xF0) << 18 ^ (buf[1] ^ 0x80) << 12`
			`^ (buf[2] ^ 0x80) << 6 ^ (buf[3] ^ 0x80);`
			`}`
			`return u;`
			`}`

			`/*`
			`* decode_utf16 --`
			`* Decode a UTF-16 character from byte string to Unicode.`
			`* Returns -1 if the first unsigned integer is invalid.`
			`*`
			`* No error detection on supplementary bytes.`
			`*`
			`* PUBLIC: int decode_utf16 __P((const char *, int));`
			`*/`
			`int decode_utf16(const char* ibuf, int bigend) {`
			`const u_char buf = (u_char )ibuf;`
			`int u = -1;`
			`unsigned int w1, w2;`

			`if (bigend)`
			`w1 = buf[0] << 8 ^ buf[1];`
			`else`
			`w1 = buf[0] ^ buf[1] << 8;`

			`if (w1 < 0xD800 \|\| w1 > 0xDFFF)`
			`u = w1;`
			`else if (w1 > 0xDBFF);`
			`else {`
			`if (bigend)`
			`w2 = buf[2] << 8 ^ buf[3];`
			`else`
			`w2 = buf[2] ^ buf[3] << 8;`
			`u = ((w1 ^ 0xD800) << 10 ^ (w2 ^ 0xDC00)) + 0x10000;`
			`}`
			`return u;`
			`}`