1ea316270f
compiler-time type checking, and also causes problems for targets where multiple incompatible calling conventions may be selected based on argument types. This change switches less(1) to ANSI prototypes. While there, we also remove use of "register", and attempt to use "const" a bit better now that the compiler can check argument types. Reviewed by: cem, emaste MFC after: 3 weeks Sponsored by: DARPA, AFRL Differential Revision: https://reviews.freebsd.org/D10152
796 lines
16 KiB
C
796 lines
16 KiB
C
/*
 * Copyright (C) 1984-2015 Mark Nudelman
 *
 * You may distribute under the terms of either the GNU General Public
 * License or the Less License, as specified in the README file.
 *
 * For more information, see the README file.
 */
|
|
|
|
|
|
/*
 * Functions to define the character set
 * and do things specific to the character set.
 */
|
|
|
|
#include "less.h"
|
|
#if HAVE_LOCALE
|
|
#include <locale.h>
|
|
#include <ctype.h>
|
|
#include <langinfo.h>
|
|
#endif
|
|
|
|
#include "charset.h"
|
|
|
|
/*
 * Nonzero when the active charset is UTF-8.
 * Starts at 0; icharset() sets it to 1 through the p_flag pointer of the
 * "utf-8" entry in charsets[].
 */
public int utf_mode = 0;
|
|
|
|
/*
|
|
* Predefined character sets,
|
|
* selected by the LESSCHARSET environment variable.
|
|
*/
|
|
struct charset {
|
|
char *name;
|
|
int *p_flag;
|
|
char *desc;
|
|
} charsets[] = {
|
|
{ "ascii", NULL, "8bcccbcc18b95.b" },
|
|
{ "utf-8", &utf_mode, "8bcccbcc18b95.b126.bb" },
|
|
{ "iso8859", NULL, "8bcccbcc18b95.33b." },
|
|
{ "latin3", NULL, "8bcccbcc18b95.33b5.b8.b15.b4.b12.b18.b12.b." },
|
|
{ "arabic", NULL, "8bcccbcc18b95.33b.3b.7b2.13b.3b.b26.5b19.b" },
|
|
{ "greek", NULL, "8bcccbcc18b95.33b4.2b4.b3.b35.b44.b" },
|
|
{ "greek2005", NULL, "8bcccbcc18b95.33b14.b35.b44.b" },
|
|
{ "hebrew", NULL, "8bcccbcc18b95.33b.b29.32b28.2b2.b" },
|
|
{ "koi8-r", NULL, "8bcccbcc18b95.b." },
|
|
{ "KOI8-T", NULL, "8bcccbcc18b95.b8.b6.b8.b.b.5b7.3b4.b4.b3.b.b.3b." },
|
|
{ "georgianps", NULL, "8bcccbcc18b95.3b11.4b12.2b." },
|
|
{ "tcvn", NULL, "b..b...bcccbccbbb7.8b95.b48.5b." },
|
|
{ "TIS-620", NULL, "8bcccbcc18b95.b.4b.11b7.8b." },
|
|
{ "next", NULL, "8bcccbcc18b95.bb125.bb" },
|
|
{ "dos", NULL, "8bcccbcc12bc5b95.b." },
|
|
{ "windows-1251", NULL, "8bcccbcc12bc5b95.b24.b." },
|
|
{ "windows-1252", NULL, "8bcccbcc12bc5b95.b.b11.b.2b12.b." },
|
|
{ "windows-1255", NULL, "8bcccbcc12bc5b95.b.b8.b.5b9.b.4b." },
|
|
{ "ebcdic", NULL, "5bc6bcc7bcc41b.9b7.9b5.b..8b6.10b6.b9.7b9.8b8.17b3.3b9.7b9.8b8.6b10.b.b.b." },
|
|
{ "IBM-1047", NULL, "4cbcbc3b9cbccbccbb4c6bcc5b3cbbc4bc4bccbc191.b" },
|
|
{ NULL, NULL, NULL }
|
|
};
|
|
|
|
/*
 * Alternate spellings of charset names, such as the values reported by
 * "locale charmap" / nl_langinfo(CODESET).  icharset() replaces a name
 * found in the "name" column by the matching "oname" before searching
 * the charsets[] table.
 */
struct cs_alias {
	char *name;	/* alias as supplied by the user or the locale */
	char *oname;	/* name of the charsets[] entry it stands for */
};

struct cs_alias cs_aliases[] = {
	{ "UTF-8",		"utf-8" },
	{ "ANSI_X3.4-1968",	"ascii" },
	{ "US-ASCII",		"ascii" },
	{ "latin1",		"iso8859" },
	{ "ISO-8859-1",		"iso8859" },
	{ "latin9",		"iso8859" },
	{ "ISO-8859-15",	"iso8859" },
	{ "latin2",		"iso8859" },
	{ "ISO-8859-2",		"iso8859" },
	{ "ISO-8859-3",		"latin3" },
	{ "latin4",		"iso8859" },
	{ "ISO-8859-4",		"iso8859" },
	{ "cyrillic",		"iso8859" },
	{ "ISO-8859-5",		"iso8859" },
	{ "ISO-8859-6",		"arabic" },
	{ "ISO-8859-7",		"greek" },
	{ "IBM9005",		"greek2005" },
	{ "ISO-8859-8",		"hebrew" },
	{ "latin5",		"iso8859" },
	{ "ISO-8859-9",		"iso8859" },
	{ "latin6",		"iso8859" },
	{ "ISO-8859-10",	"iso8859" },
	{ "latin7",		"iso8859" },
	{ "ISO-8859-13",	"iso8859" },
	{ "latin8",		"iso8859" },
	{ "ISO-8859-14",	"iso8859" },
	{ "latin10",		"iso8859" },
	{ "ISO-8859-16",	"iso8859" },
	{ "IBM437",		"dos" },
	{ "EBCDIC-US",		"ebcdic" },
	{ "IBM1047",		"IBM-1047" },
	{ "KOI8-R",		"koi8-r" },
	{ "KOI8-U",		"koi8-r" },
	{ "GEORGIAN-PS",	"georgianps" },
	{ "TCVN5712-1",		"tcvn" },
	{ "NEXTSTEP",		"next" },
	{ "windows",		"windows-1252" }, /* backward compatibility */
	{ "CP1251",		"windows-1251" },
	{ "CP1252",		"windows-1252" },
	{ "CP1255",		"windows-1255" },
	/* End-of-table marker: icharset() stops at the NULL name. */
	{ NULL, NULL }
};
|
|
|
|
#define IS_BINARY_CHAR 01
|
|
#define IS_CONTROL_CHAR 02
|
|
|
|
static char chardef[256];
|
|
static char *binfmt = NULL;
|
|
static char *utfbinfmt = NULL;
|
|
public int binattr = AT_STANDOUT;
|
|
|
|
|
|
/*
|
|
* Define a charset, given a description string.
|
|
* The string consists of 256 letters,
|
|
* one for each character in the charset.
|
|
* If the string is shorter than 256 letters, missing letters
|
|
* are taken to be identical to the last one.
|
|
* A decimal number followed by a letter is taken to be a
|
|
* repetition of the letter.
|
|
*
|
|
* Each letter is one of:
|
|
* . normal character
|
|
* b binary character
|
|
* c control character
|
|
*/
|
|
static void
|
|
ichardef(char *s)
|
|
{
|
|
char *cp;
|
|
int n;
|
|
char v;
|
|
|
|
n = 0;
|
|
v = 0;
|
|
cp = chardef;
|
|
while (*s != '\0')
|
|
{
|
|
switch (*s++)
|
|
{
|
|
case '.':
|
|
v = 0;
|
|
break;
|
|
case 'c':
|
|
v = IS_CONTROL_CHAR;
|
|
break;
|
|
case 'b':
|
|
v = IS_BINARY_CHAR|IS_CONTROL_CHAR;
|
|
break;
|
|
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9':
|
|
n = (10 * n) + (s[-1] - '0');
|
|
continue;
|
|
|
|
default:
|
|
error("invalid chardef", NULL_PARG);
|
|
quit(QUIT_ERROR);
|
|
/*NOTREACHED*/
|
|
}
|
|
|
|
do
|
|
{
|
|
if (cp >= chardef + sizeof(chardef))
|
|
{
|
|
error("chardef longer than 256", NULL_PARG);
|
|
quit(QUIT_ERROR);
|
|
/*NOTREACHED*/
|
|
}
|
|
*cp++ = v;
|
|
} while (--n > 0);
|
|
n = 0;
|
|
}
|
|
|
|
while (cp < chardef + sizeof(chardef))
|
|
*cp++ = v;
|
|
}
|
|
|
|
/*
|
|
* Define a charset, given a charset name.
|
|
* The valid charset names are listed in the "charsets" array.
|
|
*/
|
|
static int
|
|
icharset(char *name, int no_error)
|
|
{
|
|
struct charset *p;
|
|
struct cs_alias *a;
|
|
|
|
if (name == NULL || *name == '\0')
|
|
return (0);
|
|
|
|
/* First see if the name is an alias. */
|
|
for (a = cs_aliases; a->name != NULL; a++)
|
|
{
|
|
if (strcmp(name, a->name) == 0)
|
|
{
|
|
name = a->oname;
|
|
break;
|
|
}
|
|
}
|
|
|
|
for (p = charsets; p->name != NULL; p++)
|
|
{
|
|
if (strcmp(name, p->name) == 0)
|
|
{
|
|
ichardef(p->desc);
|
|
if (p->p_flag != NULL)
|
|
*(p->p_flag) = 1;
|
|
return (1);
|
|
}
|
|
}
|
|
|
|
if (!no_error) {
|
|
error("invalid charset name", NULL_PARG);
|
|
quit(QUIT_ERROR);
|
|
}
|
|
return (0);
|
|
}
|
|
|
|
#if HAVE_LOCALE
|
|
/*
|
|
* Define a charset, given a locale name.
|
|
*/
|
|
static void
|
|
ilocale(void)
|
|
{
|
|
int c;
|
|
|
|
for (c = 0; c < (int) sizeof(chardef); c++)
|
|
{
|
|
if (isprint(c))
|
|
chardef[c] = 0;
|
|
else if (iscntrl(c))
|
|
chardef[c] = IS_CONTROL_CHAR;
|
|
else
|
|
chardef[c] = IS_BINARY_CHAR|IS_CONTROL_CHAR;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
/*
|
|
* Define the printing format for control (or binary utf) chars.
|
|
*/
|
|
static void
|
|
setbinfmt(char *s, char **fmtvarptr, char *default_fmt)
|
|
{
|
|
if (s && utf_mode)
|
|
{
|
|
/* It would be too hard to account for width otherwise. */
|
|
char *t = s;
|
|
while (*t)
|
|
{
|
|
if (*t < ' ' || *t > '~')
|
|
{
|
|
s = default_fmt;
|
|
goto attr;
|
|
}
|
|
t++;
|
|
}
|
|
}
|
|
|
|
/* %n is evil */
|
|
if (s == NULL || *s == '\0' ||
|
|
(*s == '*' && (s[1] == '\0' || s[2] == '\0' || strchr(s + 2, 'n'))) ||
|
|
(*s != '*' && strchr(s, 'n')))
|
|
s = default_fmt;
|
|
|
|
/*
|
|
* Select the attributes if it starts with "*".
|
|
*/
|
|
attr:
|
|
if (*s == '*')
|
|
{
|
|
switch (s[1])
|
|
{
|
|
case 'd': binattr = AT_BOLD; break;
|
|
case 'k': binattr = AT_BLINK; break;
|
|
case 's': binattr = AT_STANDOUT; break;
|
|
case 'u': binattr = AT_UNDERLINE; break;
|
|
default: binattr = AT_NORMAL; break;
|
|
}
|
|
s += 2;
|
|
}
|
|
*fmtvarptr = s;
|
|
}
|
|
|
|
/*
 * Choose the character set.  The sources are tried in this order and the
 * first one that succeeds wins:
 *   1. LESSCHARSET, a charset name (an invalid name is a fatal error);
 *   2. LESSCHARDEF, a description string for ichardef();
 *   3. the locale's codeset name, when nl_langinfo(CODESET) is available;
 *   4. "utf-8", if LC_ALL, LC_CTYPE or LANG mentions UTF-8;
 *   5. the locale's <ctype.h> classification when locale support is
 *      compiled in, otherwise a predefined default charset.
 */
static void
set_charset(void)
{
	char *s;

	/*
	 * See if environment variable LESSCHARSET is defined.
	 */
	s = lgetenv("LESSCHARSET");
	if (icharset(s, 0))
		return;

	/*
	 * LESSCHARSET is not defined: try LESSCHARDEF.
	 */
	s = lgetenv("LESSCHARDEF");
	if (s != NULL && *s != '\0')
	{
		ichardef(s);
		return;
	}

#if HAVE_LOCALE
#ifdef CODESET
	/*
	 * Try using the codeset name as the charset name.
	 */
	s = nl_langinfo(CODESET);
	if (icharset(s, 1))
		return;
#endif
#endif

#if HAVE_STRSTR
	/*
	 * Check whether LC_ALL, LC_CTYPE or LANG look like UTF-8 is used.
	 */
	if ((s = lgetenv("LC_ALL")) != NULL ||
	    (s = lgetenv("LC_CTYPE")) != NULL ||
	    (s = lgetenv("LANG")) != NULL)
	{
		if (   strstr(s, "UTF-8") != NULL || strstr(s, "utf-8") != NULL
		    || strstr(s, "UTF8")  != NULL || strstr(s, "utf8")  != NULL)
			if (icharset("utf-8", 1))
				return;
	}
#endif

#if HAVE_LOCALE
	/*
	 * Get character definitions from locale functions,
	 * rather than from predefined charset entry.
	 */
	ilocale();
#else
	/*
	 * No locale support: fall back to a predefined charset.  This must
	 * be the alternative to ilocale(), not an addition to it, because
	 * loading a predefined charset rewrites the whole table and would
	 * discard the locale-derived definitions.
	 */
#if MSDOS_COMPILER
	/*
	 * Default to "dos".
	 */
	(void) icharset("dos", 1);
#else
	/*
	 * Default to "latin1".
	 */
	(void) icharset("latin1", 1);
#endif
#endif
}
|
|
|
|
/*
|
|
* Initialize charset data structures.
|
|
*/
|
|
public void
|
|
init_charset(void)
|
|
{
|
|
char *s;
|
|
|
|
#if HAVE_LOCALE
|
|
setlocale(LC_ALL, "");
|
|
#endif
|
|
|
|
set_charset();
|
|
|
|
s = lgetenv("LESSBINFMT");
|
|
setbinfmt(s, &binfmt, "*s<%02X>");
|
|
|
|
s = lgetenv("LESSUTFBINFMT");
|
|
setbinfmt(s, &utfbinfmt, "<U+%04lX>");
|
|
}
|
|
|
|
/*
|
|
* Is a given character a "binary" character?
|
|
*/
|
|
public int
|
|
binary_char(LWCHAR c)
|
|
{
|
|
if (utf_mode)
|
|
return (is_ubin_char(c));
|
|
c &= 0377;
|
|
return (chardef[c] & IS_BINARY_CHAR);
|
|
}
|
|
|
|
/*
|
|
* Is a given character a "control" character?
|
|
*/
|
|
public int
|
|
control_char(LWCHAR c)
|
|
{
|
|
c &= 0377;
|
|
return (chardef[c] & IS_CONTROL_CHAR);
|
|
}
|
|
|
|
/*
|
|
* Return the printable form of a character.
|
|
* For example, in the "ascii" charset '\3' is printed as "^C".
|
|
*/
|
|
public char *
|
|
prchar(LWCHAR c)
|
|
{
|
|
/* {{ This buffer can be overrun if LESSBINFMT is a long string. }} */
|
|
static char buf[32];
|
|
|
|
c &= 0377;
|
|
if ((c < 128 || !utf_mode) && !control_char(c))
|
|
SNPRINTF1(buf, sizeof(buf), "%c", (int) c);
|
|
else if (c == ESC)
|
|
strcpy(buf, "ESC");
|
|
#if IS_EBCDIC_HOST
|
|
else if (!binary_char(c) && c < 64)
|
|
SNPRINTF1(buf, sizeof(buf), "^%c",
|
|
/*
|
|
* This array roughly inverts CONTROL() #defined in less.h,
|
|
* and should be kept in sync with CONTROL() and IBM-1047.
|
|
*/
|
|
"@ABC.I.?...KLMNO"
|
|
"PQRS.JH.XY.."
|
|
"\\]^_"
|
|
"......W[.....EFG"
|
|
"..V....D....TU.Z"[c]);
|
|
#else
|
|
else if (c < 128 && !control_char(c ^ 0100))
|
|
SNPRINTF1(buf, sizeof(buf), "^%c", (int) (c ^ 0100));
|
|
#endif
|
|
else
|
|
SNPRINTF1(buf, sizeof(buf), binfmt, c);
|
|
return (buf);
|
|
}
|
|
|
|
/*
|
|
* Return the printable form of a UTF-8 character.
|
|
*/
|
|
public char *
|
|
prutfchar(LWCHAR ch)
|
|
{
|
|
static char buf[32];
|
|
|
|
if (ch == ESC)
|
|
strcpy(buf, "ESC");
|
|
else if (ch < 128 && control_char(ch))
|
|
{
|
|
if (!control_char(ch ^ 0100))
|
|
SNPRINTF1(buf, sizeof(buf), "^%c", ((char) ch) ^ 0100);
|
|
else
|
|
SNPRINTF1(buf, sizeof(buf), binfmt, (char) ch);
|
|
} else if (is_ubin_char(ch))
|
|
{
|
|
SNPRINTF1(buf, sizeof(buf), utfbinfmt, ch);
|
|
} else
|
|
{
|
|
char *p = buf;
|
|
if (ch >= 0x80000000)
|
|
ch = 0xFFFD; /* REPLACEMENT CHARACTER */
|
|
put_wchar(&p, ch);
|
|
*p = '\0';
|
|
}
|
|
return (buf);
|
|
}
|
|
|
|
/*
|
|
* Get the length of a UTF-8 character in bytes.
|
|
*/
|
|
public int
|
|
utf_len(char ch)
|
|
{
|
|
if ((ch & 0x80) == 0)
|
|
return 1;
|
|
if ((ch & 0xE0) == 0xC0)
|
|
return 2;
|
|
if ((ch & 0xF0) == 0xE0)
|
|
return 3;
|
|
if ((ch & 0xF8) == 0xF0)
|
|
return 4;
|
|
if ((ch & 0xFC) == 0xF8)
|
|
return 5;
|
|
if ((ch & 0xFE) == 0xFC)
|
|
return 6;
|
|
/* Invalid UTF-8 encoding. */
|
|
return 1;
|
|
}
|
|
|
|
/*
|
|
* Does the parameter point to the lead byte of a well-formed UTF-8 character?
|
|
*/
|
|
public int
|
|
is_utf8_well_formed(unsigned char *s, int slen)
|
|
{
|
|
int i;
|
|
int len;
|
|
|
|
if (IS_UTF8_INVALID(s[0]))
|
|
return (0);
|
|
|
|
len = utf_len((char) s[0]);
|
|
if (len > slen)
|
|
return (0);
|
|
if (len == 1)
|
|
return (1);
|
|
if (len == 2)
|
|
{
|
|
if (s[0] < 0xC2)
|
|
return (0);
|
|
} else
|
|
{
|
|
unsigned char mask;
|
|
mask = (~((1 << (8-len)) - 1)) & 0xFF;
|
|
if (s[0] == mask && (s[1] & mask) == 0x80)
|
|
return (0);
|
|
}
|
|
|
|
for (i = 1; i < len; i++)
|
|
if (!IS_UTF8_TRAIL(s[i]))
|
|
return (0);
|
|
return (1);
|
|
}
|
|
|
|
/*
|
|
* Return number of invalid UTF-8 sequences found in a buffer.
|
|
*/
|
|
public int
|
|
utf_bin_count(unsigned char *data, int len)
|
|
{
|
|
int bin_count = 0;
|
|
while (len > 0)
|
|
{
|
|
if (is_utf8_well_formed(data, len))
|
|
{
|
|
int clen = utf_len(*data);
|
|
data += clen;
|
|
len -= clen;
|
|
} else
|
|
{
|
|
/* Skip to next lead byte. */
|
|
bin_count++;
|
|
do {
|
|
++data;
|
|
--len;
|
|
} while (len > 0 && !IS_UTF8_LEAD(*data));
|
|
}
|
|
}
|
|
return (bin_count);
|
|
}
|
|
|
|
/*
|
|
* Get the value of a UTF-8 character.
|
|
*/
|
|
public LWCHAR
|
|
get_wchar(constant char *p)
|
|
{
|
|
switch (utf_len(p[0]))
|
|
{
|
|
case 1:
|
|
default:
|
|
/* 0xxxxxxx */
|
|
return (LWCHAR)
|
|
(p[0] & 0xFF);
|
|
case 2:
|
|
/* 110xxxxx 10xxxxxx */
|
|
return (LWCHAR) (
|
|
((p[0] & 0x1F) << 6) |
|
|
(p[1] & 0x3F));
|
|
case 3:
|
|
/* 1110xxxx 10xxxxxx 10xxxxxx */
|
|
return (LWCHAR) (
|
|
((p[0] & 0x0F) << 12) |
|
|
((p[1] & 0x3F) << 6) |
|
|
(p[2] & 0x3F));
|
|
case 4:
|
|
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|
return (LWCHAR) (
|
|
((p[0] & 0x07) << 18) |
|
|
((p[1] & 0x3F) << 12) |
|
|
((p[2] & 0x3F) << 6) |
|
|
(p[3] & 0x3F));
|
|
case 5:
|
|
/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|
return (LWCHAR) (
|
|
((p[0] & 0x03) << 24) |
|
|
((p[1] & 0x3F) << 18) |
|
|
((p[2] & 0x3F) << 12) |
|
|
((p[3] & 0x3F) << 6) |
|
|
(p[4] & 0x3F));
|
|
case 6:
|
|
/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|
return (LWCHAR) (
|
|
((p[0] & 0x01) << 30) |
|
|
((p[1] & 0x3F) << 24) |
|
|
((p[2] & 0x3F) << 18) |
|
|
((p[3] & 0x3F) << 12) |
|
|
((p[4] & 0x3F) << 6) |
|
|
(p[5] & 0x3F));
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Store a character into a UTF-8 string.
|
|
*/
|
|
public void
|
|
put_wchar(char **pp, LWCHAR ch)
|
|
{
|
|
if (!utf_mode || ch < 0x80)
|
|
{
|
|
/* 0xxxxxxx */
|
|
*(*pp)++ = (char) ch;
|
|
} else if (ch < 0x800)
|
|
{
|
|
/* 110xxxxx 10xxxxxx */
|
|
*(*pp)++ = (char) (0xC0 | ((ch >> 6) & 0x1F));
|
|
*(*pp)++ = (char) (0x80 | (ch & 0x3F));
|
|
} else if (ch < 0x10000)
|
|
{
|
|
/* 1110xxxx 10xxxxxx 10xxxxxx */
|
|
*(*pp)++ = (char) (0xE0 | ((ch >> 12) & 0x0F));
|
|
*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
|
|
*(*pp)++ = (char) (0x80 | (ch & 0x3F));
|
|
} else if (ch < 0x200000)
|
|
{
|
|
/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|
*(*pp)++ = (char) (0xF0 | ((ch >> 18) & 0x07));
|
|
*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
|
|
*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
|
|
*(*pp)++ = (char) (0x80 | (ch & 0x3F));
|
|
} else if (ch < 0x4000000)
|
|
{
|
|
/* 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|
*(*pp)++ = (char) (0xF0 | ((ch >> 24) & 0x03));
|
|
*(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
|
|
*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
|
|
*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
|
|
*(*pp)++ = (char) (0x80 | (ch & 0x3F));
|
|
} else
|
|
{
|
|
/* 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
|
*(*pp)++ = (char) (0xF0 | ((ch >> 30) & 0x01));
|
|
*(*pp)++ = (char) (0x80 | ((ch >> 24) & 0x3F));
|
|
*(*pp)++ = (char) (0x80 | ((ch >> 18) & 0x3F));
|
|
*(*pp)++ = (char) (0x80 | ((ch >> 12) & 0x3F));
|
|
*(*pp)++ = (char) (0x80 | ((ch >> 6) & 0x3F));
|
|
*(*pp)++ = (char) (0x80 | (ch & 0x3F));
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Step forward or backward one character in a string.
|
|
*/
|
|
public LWCHAR
|
|
step_char(constant char **pp, signed int dir, constant char *limit)
|
|
{
|
|
LWCHAR ch;
|
|
int len;
|
|
constant char *p = *pp;
|
|
|
|
if (!utf_mode)
|
|
{
|
|
/* It's easy if chars are one byte. */
|
|
if (dir > 0)
|
|
ch = (LWCHAR) ((p < limit) ? *p++ : 0);
|
|
else
|
|
ch = (LWCHAR) ((p > limit) ? *--p : 0);
|
|
} else if (dir > 0)
|
|
{
|
|
len = utf_len(*p);
|
|
if (p + len > limit)
|
|
{
|
|
ch = 0;
|
|
p = limit;
|
|
} else
|
|
{
|
|
ch = get_wchar(p);
|
|
p += len;
|
|
}
|
|
} else
|
|
{
|
|
while (p > limit && IS_UTF8_TRAIL(p[-1]))
|
|
p--;
|
|
if (p > limit)
|
|
ch = get_wchar(--p);
|
|
else
|
|
ch = 0;
|
|
}
|
|
*pp = p;
|
|
return ch;
|
|
}
|
|
|
|
/*
|
|
* Unicode characters data
|
|
* Actual data is in the generated *.uni files.
|
|
*/
|
|
|
|
#define DECLARE_RANGE_TABLE_START(name) \
|
|
static struct wchar_range name##_array[] = {
|
|
#define DECLARE_RANGE_TABLE_END(name) \
|
|
}; struct wchar_range_table name##_table = { name##_array, sizeof(name##_array)/sizeof(*name##_array) };
|
|
|
|
DECLARE_RANGE_TABLE_START(compose)
|
|
#include "compose.uni"
|
|
DECLARE_RANGE_TABLE_END(compose)
|
|
|
|
DECLARE_RANGE_TABLE_START(ubin)
|
|
#include "ubin.uni"
|
|
DECLARE_RANGE_TABLE_END(ubin)
|
|
|
|
DECLARE_RANGE_TABLE_START(wide)
|
|
#include "wide.uni"
|
|
DECLARE_RANGE_TABLE_END(wide)
|
|
|
|
/* comb_table is special pairs, not ranges. */
|
|
static struct wchar_range comb_table[] = {
|
|
{0x0644,0x0622}, {0x0644,0x0623}, {0x0644,0x0625}, {0x0644,0x0627},
|
|
};
|
|
|
|
|
|
static int
|
|
is_in_table(LWCHAR ch, struct wchar_range_table *table)
|
|
{
|
|
int hi;
|
|
int lo;
|
|
|
|
/* Binary search in the table. */
|
|
if (ch < table->table[0].first)
|
|
return 0;
|
|
lo = 0;
|
|
hi = table->count - 1;
|
|
while (lo <= hi)
|
|
{
|
|
int mid = (lo + hi) / 2;
|
|
if (ch > table->table[mid].last)
|
|
lo = mid + 1;
|
|
else if (ch < table->table[mid].first)
|
|
hi = mid - 1;
|
|
else
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Is a character a UTF-8 composing character?
|
|
* If a composing character follows any char, the two combine into one glyph.
|
|
*/
|
|
public int
|
|
is_composing_char(LWCHAR ch)
|
|
{
|
|
return is_in_table(ch, &compose_table);
|
|
}
|
|
|
|
/*
|
|
* Should this UTF-8 character be treated as binary?
|
|
*/
|
|
public int
|
|
is_ubin_char(LWCHAR ch)
|
|
{
|
|
return is_in_table(ch, &ubin_table);
|
|
}
|
|
|
|
/*
|
|
* Is this a double width UTF-8 character?
|
|
*/
|
|
public int
|
|
is_wide_char(LWCHAR ch)
|
|
{
|
|
return is_in_table(ch, &wide_table);
|
|
}
|
|
|
|
/*
|
|
* Is a character a UTF-8 combining character?
|
|
* A combining char acts like an ordinary char, but if it follows
|
|
* a specific char (not any char), the two combine into one glyph.
|
|
*/
|
|
public int
|
|
is_combining_char(LWCHAR ch1, LWCHAR ch2)
|
|
{
|
|
/* The table is small; use linear search. */
|
|
int i;
|
|
for (i = 0; i < sizeof(comb_table)/sizeof(*comb_table); i++)
|
|
{
|
|
if (ch1 == comb_table[i].first &&
|
|
ch2 == comb_table[i].last)
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|