yuripv 85096f812f vi: fix UTF-8 detection.
PR:		202290
Submitted by:	lampa@fit.vutbr.cz
Reviewed by:	bapt
Approved by:	kib (mentor, implicit)
MFC after:	3 days
Differential Revision:	https://reviews.freebsd.org/D17950
2018-11-26 15:33:55 +00:00

237 lines
5.6 KiB
C

/*-
* Copyright (c) 2011, 2012
* Zhihao Yuan. All rights reserved.
*
* See the LICENSE file for redistribution information.
*/
#ifndef lint
static const char sccsid[] = "$Id: encoding.c,v 1.4 2011/12/13 19:40:52 zy Exp $";
#endif /* not lint */
#include <sys/types.h>
int looks_utf8(const char *, size_t);
int looks_utf16(const char *, size_t);
int decode_utf8(const char *);
int decode_utf16(const char *, int);
#define F 0 /* character never appears in text */
#define T 1 /* character appears in plain ASCII text */
#define I 2 /* character appears in ISO-8859 text */
#define X 3 /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
static char text_chars[256] = {
/* BEL BS HT LF FF CR */
F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F, /* 0x0X */
/* ESC */
F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */
T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */
T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */
T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */
T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */
T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */
T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */
/* NEL */
X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X, /* 0x8X */
X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, /* 0x9X */
I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xaX */
I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xbX */
I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xcX */
I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xdX */
I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, /* 0xeX */
I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I /* 0xfX */
};
/*
* looks_utf8 --
* Decide whether some text looks like UTF-8. Returns:
*
* -1: invalid UTF-8
* 0: uses odd control characters, so doesn't look like text
* 1: 7-bit text
* 2: definitely UTF-8 text (valid high-bit set bytes)
*
* Based on RFC 3629. UTF-8 with BOM is not accepted.
*
* PUBLIC: int looks_utf8(const char *, size_t);
*/
int
looks_utf8(const char *ibuf, size_t nbytes)
{
const u_char *buf = (u_char *)ibuf;
size_t i;
int n;
int gotone = 0, ctrl = 0;
for (i = 0; i < nbytes; i++) {
if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
/*
* Even if the whole file is valid UTF-8 sequences,
* still reject it if it uses weird control characters.
*/
if (text_chars[buf[i]] != T)
ctrl = 1;
} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
return -1;
} else { /* 11xxxxxx begins UTF-8 */
int following;
if ((buf[i] & 0x20) == 0) /* 110xxxxx */
if (buf[i] > 0xC1) /* C0, C1 */
following = 1;
else return -1;
else if ((buf[i] & 0x10) == 0) /* 1110xxxx */
following = 2;
else if ((buf[i] & 0x08) == 0) /* 11110xxx */
if (buf[i] < 0xF5)
following = 3;
else return -1; /* F5, F6, F7 */
else
return -1; /* F8~FF */
for (n = 0; n < following; n++) {
i++;
if (i >= nbytes)
goto done;
if ((buf[i] & 0xc0) != 0x80) /* 10xxxxxx */
return -1;
}
gotone = 1;
}
}
done:
return ctrl ? 0 : (gotone ? 2 : 1);
}
/*
* looks_utf16 --
* Decide whether some text looks like UTF-16. Returns:
*
* 0: invalid UTF-16
* 1: Little-endian UTF-16
* 2: Big-endian UTF-16
*
* PUBLIC: int looks_utf16(const char *, size_t);
*/
int
looks_utf16(const char *ibuf, size_t nbytes)
{
const u_char *buf = (u_char *)ibuf;
int bigend;
size_t i;
unsigned int c;
int bom;
int following = 0;
if (nbytes < 2)
return 0;
bom = buf[0] << 8 ^ buf[1];
if (bom == 0xFFFE)
bigend = 0;
else if (bom == 0xFEFF)
bigend = 1;
else
return 0;
for (i = 2; i + 1 < nbytes; i += 2) {
if (bigend)
c = buf[i] << 8 ^ buf[i + 1];
else
c = buf[i] ^ buf[i + 1] << 8;
if (!following)
if (c < 0xD800 || c > 0xDFFF)
if (c < 128 && text_chars[c] != T)
return 0;
else
following = 0;
else if (c > 0xDBFF)
return 0;
else {
following = 1;
continue;
}
else if (c < 0xDC00 || c > 0xDFFF)
return 0;
}
return 1 + bigend;
}
#undef F
#undef T
#undef I
#undef X
/*
* decode_utf8 --
* Decode a UTF-8 character from byte string to Unicode.
* Returns -1 if the first byte is a not UTF-8 leader.
*
* Based on RFC 3629, but without error detection.
*
* PUBLIC: int decode_utf8(const char *);
*/
int
decode_utf8(const char *ibuf)
{
const u_char *buf = (u_char *)ibuf;
int u = -1;
if ((buf[0] & 0x80) == 0)
u = buf[0];
else if ((buf[0] & 0x40) == 0);
else {
if ((buf[0] & 0x20) == 0)
u = (buf[0] ^ 0xC0) << 6 ^ (buf[1] ^ 0x80);
else if ((buf[0] & 0x10) == 0)
u = (buf[0] ^ 0xE0) << 12 ^ (buf[1] ^ 0x80) << 6
^ (buf[2] ^ 0x80);
else if (((buf[0] & 0x08) == 0))
u = (buf[0] ^ 0xF0) << 18 ^ (buf[1] ^ 0x80) << 12
^ (buf[2] ^ 0x80) << 6 ^ (buf[3] ^ 0x80);
}
return u;
}
/*
* decode_utf16 --
* Decode a UTF-16 character from byte string to Unicode.
* Returns -1 if the first unsigned integer is invalid.
*
* No error detection on supplementary bytes.
*
* PUBLIC: int decode_utf16(const char *, int);
*/
int
decode_utf16(const char* ibuf, int bigend)
{
const u_char *buf = (u_char *)ibuf;
int u = -1;
unsigned int w1, w2;
if (bigend)
w1 = buf[0] << 8 ^ buf[1];
else
w1 = buf[0] ^ buf[1] << 8;
if (w1 < 0xD800 || w1 > 0xDFFF)
u = w1;
else if (w1 > 0xDBFF);
else {
if (bigend)
w2 = buf[2] << 8 ^ buf[3];
else
w2 = buf[2] ^ buf[3] << 8;
u = ((w1 ^ 0xD800) << 10 ^ (w2 ^ 0xDC00)) + 0x10000;
}
return u;
}