freebsd-dev/usr.bin/localedef/scanner.c
Baptiste Daroussin 057ca2d437 Add localedef(1), a locale definition generator tool
The localedef tool can read entire (and unmodified) CLDR posix definition
files, and generate all 6 LC categories: LC_COLLATE, LC_CTYPE, LC_TIME,
LC_NUMERIC, LC_MONETARY and LC_MESSAGES.

This tool has a long history with Solaris.  The Nexenta developers
modified it to read CLDR files and created the much richer collation
formats.  The libc collation functions have to be modified to read the
new format (called "BSD-1.0") and to handle the new data structures.

The result will be that locale-sensitive tools and functions will now
properly sort multibyte and unicode strings.

Obtained from:	Dragonfly
2015-08-07 23:53:31 +00:00

867 lines
17 KiB
C

/*
* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015 John Marino <draco@marino.st>
*
* This source code is derived from the illumos localedef command, and
* provided under BSD-style license terms by Nexenta Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This file contains the "scanner", which tokenizes the input files
* for localedef for processing by the higher level grammar processor.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <limits.h>
#include <string.h>
#include <wchar.h>
#include <sys/types.h>
#include <assert.h>
#include "localedef.h"
#include "parser.h"
/* Character that starts a comment; yylex() discards from it to end of line. */
int com_char = '#';
/* Character that introduces an escape sequence in the input. */
int esc_char = '\\';
/*
 * NOTE(review): presumably the minimum number of bytes per character of
 * the active charmap; it is not referenced in this part of the file.
 */
int mb_cur_min = 1;
/* Maximum number of bytes get_wide() collects for one multibyte character. */
int mb_cur_max = 1;
/* Line number printed in diagnostics; scanc() copies it from nextline. */
int lineno = 1;
/* Number of warnings reported through warn(). */
int warnings = 0;
/* Nonzero when scanc()/unscanc() operate on stdin instead of `input'. */
int is_stdin = 1;
/* Stream opened by reset_scanner() when it is given a file name. */
FILE *input;
/*
 * Line number of the next character to be read: scanc() increments it
 * after a '\n' and unscanc() decrements it when a '\n' is pushed back.
 */
static int nextline;
/* Name of the current input, used as the prefix of every diagnostic. */
static const char *filename = "<stdin>";
/* Nonzero while yylex() is between an opening and a closing double quote. */
static int instring = 0;
/* Nonzero when the previous character was esc_char. */
static int escaped = 0;
/*
* Token space ... grows on demand.
* token is the NUL-terminated buffer filled by add_tok(), tokidx the number
* of characters currently in it, and toksz its allocated size in bytes.
* hadtok is set by yylex() once the current line has produced a token, so
* that lines without any token do not generate T_NL.
*/
static char *token = NULL;
static int tokidx;
static int toksz = 0;
static int hadtok = 0;
/*
* Wide string space ... grows on demand.
* widestr is the 0-terminated buffer filled by add_wcs(), wideidx the
* number of wide characters in it, and widesz its capacity in elements.
* get_wcs() hands the buffer to its caller and resets all three.
*/
static wchar_t *widestr = NULL;
static int wideidx = 0;
static int widesz = 0;
/*
* The last keyword seen. This is useful to trigger the special lexer rules
* for "copy" and also collating symbols and elements.
*/
int last_kw = 0;
/*
* The top-level category currently being scanned; T_END while outside of
* any category. Updated by consume_token() and reported by get_category().
*/
static int category = T_END;
/*
* A reserved word: the token id returned to the parser and its spelling.
* keywords[] is searched linearly by consume_token() with an exact,
* case-sensitive strcmp(); the table ends at the entry whose name is NULL.
*/
static struct token {
int id;
const char *name;
} keywords[] = {
{ T_COM_CHAR, "comment_char" },
{ T_ESC_CHAR, "escape_char" },
{ T_END, "END" },
{ T_COPY, "copy" },
{ T_MESSAGES, "LC_MESSAGES" },
{ T_YESSTR, "yesstr" },
{ T_YESEXPR, "yesexpr" },
{ T_NOSTR, "nostr" },
{ T_NOEXPR, "noexpr" },
{ T_MONETARY, "LC_MONETARY" },
{ T_INT_CURR_SYMBOL, "int_curr_symbol" },
{ T_CURRENCY_SYMBOL, "currency_symbol" },
{ T_MON_DECIMAL_POINT, "mon_decimal_point" },
{ T_MON_THOUSANDS_SEP, "mon_thousands_sep" },
{ T_POSITIVE_SIGN, "positive_sign" },
{ T_NEGATIVE_SIGN, "negative_sign" },
{ T_MON_GROUPING, "mon_grouping" },
{ T_INT_FRAC_DIGITS, "int_frac_digits" },
{ T_FRAC_DIGITS, "frac_digits" },
{ T_P_CS_PRECEDES, "p_cs_precedes" },
{ T_P_SEP_BY_SPACE, "p_sep_by_space" },
{ T_N_CS_PRECEDES, "n_cs_precedes" },
{ T_N_SEP_BY_SPACE, "n_sep_by_space" },
{ T_P_SIGN_POSN, "p_sign_posn" },
{ T_N_SIGN_POSN, "n_sign_posn" },
{ T_INT_P_CS_PRECEDES, "int_p_cs_precedes" },
{ T_INT_N_CS_PRECEDES, "int_n_cs_precedes" },
{ T_INT_P_SEP_BY_SPACE, "int_p_sep_by_space" },
{ T_INT_N_SEP_BY_SPACE, "int_n_sep_by_space" },
{ T_INT_P_SIGN_POSN, "int_p_sign_posn" },
{ T_INT_N_SIGN_POSN, "int_n_sign_posn" },
{ T_COLLATE, "LC_COLLATE" },
{ T_COLLATING_SYMBOL, "collating-symbol" },
{ T_COLLATING_ELEMENT, "collating-element" },
{ T_FROM, "from" },
{ T_ORDER_START, "order_start" },
{ T_ORDER_END, "order_end" },
{ T_FORWARD, "forward" },
{ T_BACKWARD, "backward" },
{ T_POSITION, "position" },
{ T_IGNORE, "IGNORE" },
{ T_UNDEFINED, "UNDEFINED" },
{ T_NUMERIC, "LC_NUMERIC" },
{ T_DECIMAL_POINT, "decimal_point" },
{ T_THOUSANDS_SEP, "thousands_sep" },
{ T_GROUPING, "grouping" },
{ T_TIME, "LC_TIME" },
{ T_ABDAY, "abday" },
{ T_DAY, "day" },
{ T_ABMON, "abmon" },
{ T_MON, "mon" },
{ T_D_T_FMT, "d_t_fmt" },
{ T_D_FMT, "d_fmt" },
{ T_T_FMT, "t_fmt" },
{ T_AM_PM, "am_pm" },
{ T_T_FMT_AMPM, "t_fmt_ampm" },
{ T_ERA, "era" },
{ T_ERA_D_FMT, "era_d_fmt" },
{ T_ERA_T_FMT, "era_t_fmt" },
{ T_ERA_D_T_FMT, "era_d_t_fmt" },
{ T_ALT_DIGITS, "alt_digits" },
{ T_CTYPE, "LC_CTYPE" },
{ T_ISUPPER, "upper" },
{ T_ISLOWER, "lower" },
{ T_ISALPHA, "alpha" },
{ T_ISDIGIT, "digit" },
{ T_ISPUNCT, "punct" },
{ T_ISXDIGIT, "xdigit" },
{ T_ISSPACE, "space" },
{ T_ISPRINT, "print" },
{ T_ISGRAPH, "graph" },
{ T_ISBLANK, "blank" },
{ T_ISCNTRL, "cntrl" },
/*
* These entries are local additions, and not specified by
* TOG. Note that they are not guaranteed to be accurate for
* all locales, and so applications should not depend on them.
*/
{ T_ISSPECIAL, "special" },
{ T_ISENGLISH, "english" },
{ T_ISPHONOGRAM, "phonogram" },
{ T_ISIDEOGRAM, "ideogram" },
{ T_ISNUMBER, "number" },
/*
* We have to support this in the grammar, but it would be a
* syntax error to define a character as one of these without
* also defining it as an alpha or digit. We ignore it in our
* parsing.
*/
{ T_ISALNUM, "alnum" },
{ T_TOUPPER, "toupper" },
{ T_TOLOWER, "tolower" },
/*
* These are keywords used in the charmap file. Note that
* Solaris originally used angle brackets to wrap some of them,
* but we removed that to simplify our parser. The first of these
* items are "global items."
*/
{ T_CHARMAP, "CHARMAP" },
{ T_WIDTH, "WIDTH" },
/* End-of-table marker: the lookup loop stops at the NULL name. */
{ -1, NULL },
};
/*
* These special words are only used in a charmap file, enclosed in <>.
* get_symbol() consults this table only while no category is open
* (category == T_END); a match is returned as that keyword's token id and
* recorded in last_kw. The table ends at the entry whose name is NULL.
*/
static struct token symwords[] = {
{ T_COM_CHAR, "comment_char" },
{ T_ESC_CHAR, "escape_char" },
{ T_CODE_SET, "code_set_name" },
{ T_MB_CUR_MAX, "mb_cur_max" },
{ T_MB_CUR_MIN, "mb_cur_min" },
{ -1, NULL },
};
/*
* Keyword ids that open a top-level category. When consume_token()
* recognizes one of these it stores it in `category'. The list is
* terminated by 0, which the lookup loop uses as its stop condition.
*/
static int categories[] = {
T_CHARMAP,
T_CTYPE,
T_COLLATE,
T_MESSAGES,
T_MONETARY,
T_NUMERIC,
T_TIME,
T_WIDTH,
0
};
/*
 * Point the scanner at a new input and reset the per-file lexer state.
 *
 * fname: path of the file to scan, or NULL to scan stdin.  The pointer
 * itself is kept for diagnostics, so the string must stay valid while
 * the file is being scanned.
 *
 * A file opened by an earlier call is closed first, whichever input is
 * selected next.  If the new file cannot be opened, a message is printed
 * with perror() and the process exits with status 4.
 */
void
reset_scanner(const char *fname)
{
	/*
	 * Release the previously opened file.  This must also happen when
	 * switching back to stdin, otherwise the old stream is leaked.
	 */
	if (!is_stdin && input != NULL) {
		(void) fclose(input);
		input = NULL;
	}

	if (fname == NULL) {
		filename = "<stdin>";
		is_stdin = 1;
	} else {
		if ((input = fopen(fname, "r")) == NULL) {
			perror("fopen");
			exit(4);
		}
		is_stdin = 0;
		filename = fname;
	}

	/* Restore the default comment and escape characters. */
	com_char = '#';
	esc_char = '\\';

	/* Forget any partially scanned string, escape, token or wide string. */
	instring = 0;
	escaped = 0;
	lineno = 1;
	nextline = 1;
	tokidx = 0;
	wideidx = 0;
}
/*
 * hex(x): numeric value (0-15) of the hexadecimal digit character x.
 * The caller must already have checked x with isxdigit().  x is
 * evaluated more than once, so it must not have side effects.
 */
#define	hex(x)	\
	(isdigit(x) ? ((x) - '0') : \
	((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10))
/*
 * isodigit(x): nonzero when x is an octal digit character ('0'-'7').
 * x is evaluated twice.
 */
#define	isodigit(x)	(((x) >= '0') && ((x) <= '7'))
/*
 * Read one raw character from the active input: stdin when is_stdin is
 * set, otherwise the stream in `input'.
 *
 * lineno is set to the line the returned character belongs to; a newline
 * advances nextline so that the following character is attributed to the
 * next line.  Returns the character read, or EOF.
 */
static int
scanc(void)
{
	FILE *src = is_stdin ? stdin : input;
	int ch = getc(src);

	lineno = nextline;
	if (ch == '\n')
		nextline++;
	return (ch);
}
/*
 * Push the character c back onto the active input so that the next
 * scanc() returns it again.
 *
 * Pushing back a newline undoes the line-count increment done by scanc().
 * EOF is accepted and ignored: ungetc() cannot push EOF back (it fails
 * and leaves the stream unchanged), and callers legitimately pass
 * whatever scanc() returned, including EOF at the end of the input.
 * Any other ungetc() failure is reported through yyerror().
 */
static void
unscanc(int c)
{
	if (c == EOF)
		return;
	if (c == '\n')
		nextline--;
	if (ungetc(c, is_stdin ? stdin : input) == EOF)
		yyerror("ungetc failed");
}
/*
 * Read exactly two hexadecimal digits from the input and return the byte
 * value they encode (high nibble first).  A non-hex character is reported
 * through yyerror("malformed hex digit") and 0 is returned.
 */
static int
scan_hex_byte(void)
{
	int digit[2];
	int i;

	for (i = 0; i < 2; i++) {
		digit[i] = scanc();
		if (!isxdigit(digit[i])) {
			yyerror("malformed hex digit");
			return (0);
		}
	}
	return ((hex(digit[0]) << 4) | hex(digit[1]));
}
/*
 * Read a decimal byte constant of two or three digits from the input and
 * return its value.
 *
 * The first two digits are mandatory; a missing one is reported through
 * yyerror("malformed decimal digit").  The third digit is optional: a
 * non-digit in that position is pushed back for the next token.
 *
 * Three digits can spell values up to 999, which do not fit in one byte;
 * such constants are rejected instead of being silently truncated when the
 * caller stores the result in a char.  Returns 0 after any error.
 */
static int
scan_dec_byte(void)
{
	int c;
	int value;

	c = scanc();
	if (!isdigit(c)) {
		yyerror("malformed decimal digit");
		return (0);
	}
	value = c - '0';

	c = scanc();
	if (!isdigit(c)) {
		yyerror("malformed decimal digit");
		return (0);
	}
	value = value * 10 + (c - '0');

	c = scanc();
	if (isdigit(c))
		value = value * 10 + (c - '0');
	else
		unscanc(c);

	if (value > UCHAR_MAX) {
		yyerror("decimal byte value out of range");
		return (0);
	}
	return (value);
}
/*
 * Read an octal byte constant of two or three digits from the input and
 * return its value.
 *
 * The first two digits are mandatory; a missing one is reported through
 * yyerror("malformed octal digit").  The third digit is optional: a
 * non-octal character in that position is pushed back for the next token.
 *
 * Three octal digits can spell values up to 0777, which do not fit in one
 * byte; such constants are rejected instead of being silently truncated
 * when the caller stores the result in a char.  Returns 0 after any error.
 */
static int
scan_oct_byte(void)
{
	int c;
	int value;

	c = scanc();
	if (!isodigit(c)) {
		yyerror("malformed octal digit");
		return (0);
	}
	value = c - '0';

	c = scanc();
	if (!isodigit(c)) {
		yyerror("malformed octal digit");
		return (0);
	}
	value = value * 8 + (c - '0');

	c = scanc();
	if (isodigit(c))
		value = value * 8 + (c - '0');
	else
		unscanc(c);

	if (value > UCHAR_MAX) {
		yyerror("octal byte value out of range");
		return (0);
	}
	return (value);
}
/*
 * Append the character c to the token being assembled, keeping the
 * buffer NUL-terminated.  The buffer grows in steps of 64 bytes whenever
 * there is no room left for the character plus the terminator.
 *
 * On allocation failure the error is reported through yyerror() and the
 * token state is cleared.
 */
void
add_tok(int c)
{
	if (tokidx + 1 >= toksz) {
		toksz += 64;
		token = realloc(token, toksz);
		if (token == NULL) {
			yyerror("out of memory");
			tokidx = 0;
			toksz = 0;
			return;
		}
	}
	token[tokidx] = (char)c;
	tokidx++;
	token[tokidx] = 0;
}
/*
 * Append the wide character c to the wide string being assembled,
 * keeping it 0-terminated.  The buffer grows in steps of 64 elements
 * whenever there is no room left for the character plus the terminator.
 *
 * On allocation failure the error is reported through yyerror() and the
 * wide string state is cleared.
 */
void
add_wcs(wchar_t c)
{
	if (wideidx + 1 >= widesz) {
		widesz += 64;
		widestr = realloc(widestr, (widesz * sizeof (wchar_t)));
		if (widestr == NULL) {
			yyerror("out of memory");
			wideidx = 0;
			widesz = 0;
			return;
		}
	}
	widestr[wideidx] = c;
	wideidx++;
	widestr[wideidx] = 0;
}
/*
 * Hand the wide string assembled by add_wcs() to the caller and start a
 * fresh, empty one.  The returned buffer is heap allocated and no longer
 * referenced by the scanner.
 *
 * If nothing was accumulated, a newly allocated empty wide string is
 * returned instead; failure to allocate it is reported through yyerror().
 */
wchar_t *
get_wcs(void)
{
	wchar_t *result;

	result = widestr;
	widestr = NULL;
	widesz = 0;
	wideidx = 0;

	if (result != NULL)
		return (result);

	result = wcsdup(L"");
	if (result == NULL)
		yyerror("out of memory");
	return (result);
}
/*
 * Read one numerically escaped byte from the input: the escape character
 * followed by d/D (decimal), x/X (hexadecimal) or an octal digit.
 *
 * Returns the byte value, or EOF when the input does not start with such
 * an escape; in that case the characters that were read are pushed back.
 */
static int
get_byte(void)
{
	int c;

	c = scanc();
	if (c != esc_char) {
		unscanc(c);
		return (EOF);
	}

	c = scanc();
	if (c == 'd' || c == 'D')
		return (scan_dec_byte());
	if (c == 'x' || c == 'X')
		return (scan_hex_byte());
	if (c >= '0' && c <= '7') {
		/* the digit is part of the value, so let the reader see it */
		unscanc(c);
		return (scan_oct_byte());
	}

	/* not a numeric escape: restore both characters */
	unscanc(c);
	unscanc(esc_char);
	return (EOF);
}
/*
 * Translate the character following an escape character into the
 * character it stands for: n, r, t, f, v, b and a map to the matching
 * C control characters.  Every other value is returned unchanged.
 */
int
get_escaped(int c)
{
	static const struct {
		int letter;
		int value;
	} map[] = {
		{ 'n', '\n' },
		{ 'r', '\r' },
		{ 't', '\t' },
		{ 'f', '\f' },
		{ 'v', '\v' },
		{ 'b', '\b' },
		{ 'a', '\a' },
	};
	unsigned int i;

	for (i = 0; i < sizeof (map) / sizeof (map[0]); i++) {
		if (map[i].letter == c)
			return (map[i].value);
	}
	return (c);
}
/*
 * Read a sequence of numerically escaped bytes and decode it into one
 * wide character.
 *
 * Bytes are collected one at a time with get_byte() until to_wide()
 * accepts the sequence.  Running out of escaped bytes, or reaching
 * mb_cur_max bytes without a successful decode, is an error.  Outside of
 * the CHARMAP and WIDTH categories the decoded character must also pass
 * check_charmap().
 *
 * Returns T_CHAR with the character in yylval.wc, or T_NULL after an
 * error has been reported through yyerror().
 */
int
get_wide(void)
{
	static char mbs[MB_LEN_MAX + 1] = "";
	static int mbi = 0;
	int byte;
	wchar_t wc;

	if (mb_cur_max >= (int)sizeof (mbs)) {
		yyerror("max multibyte character size too big");
		mbi = 0;
		return (T_NULL);
	}

	do {
		/* only fetch another byte while there is room for one */
		byte = (mbi == mb_cur_max) ? EOF : get_byte();
		if (byte == EOF) {
			/*
			 * end of the byte sequence reached, but no
			 * valid wide decoding. fatal error.
			 */
			mbi = 0;
			yyerror("not a valid character encoding");
			return (T_NULL);
		}
		mbs[mbi++] = byte;
		mbs[mbi] = 0;
	} while (to_wide(&wc, mbs) < 0);

	mbi = 0;
	if (category != T_CHARMAP && category != T_WIDTH &&
	    check_charmap(wc) < 0) {
		yyerror("no symbolic name for character");
		return (T_NULL);
	}
	yylval.wc = wc;
	return (T_CHAR);
}
/*
 * Scan a symbolic name.  Called after the opening '<' has been consumed;
 * reads up to and including the closing '>'.
 *
 * Inside the name the escape character quotes the next character (an
 * escaped newline is dropped).  Once the name is complete it is
 * classified, in this order:
 *   - while no category is open, a word from symwords[] is returned as
 *     its keyword id (also stored in last_kw);
 *   - outside of CHARMAP, a name known to lookup_charmap() yields T_CHAR
 *     with the character in yylval.wc;
 *   - a known collating symbol yields T_COLLSYM (yylval.collsym);
 *   - a known collating element yields T_COLLELEM (yylval.collelem);
 *   - anything else yields T_SYMBOL with the name in yylval.token.  The
 *     scanner's own heap buffer is handed over, so the receiver owns it.
 *
 * An empty name ("<>") is reported as "missing symbolic name" and returns
 * T_NULL.  End of input before '>' is reported and returns EOF.
 */
int
get_symbol(void)
{
	int c;
	int i;

	while ((c = scanc()) != EOF) {
		if (escaped) {
			escaped = 0;
			if (c == '\n')
				continue;
			add_tok(get_escaped(c));
			continue;
		}
		if (c == esc_char) {
			escaped = 1;
			continue;
		}
		if (c == '\n') {	/* well that's strange! */
			yyerror("unterminated symbolic name");
			continue;
		}
		if (c != '>') {
			add_tok(c);
			continue;
		}

		/*
		 * End of symbol.  The buffer is reused between tokens and
		 * keeps its old contents, so an empty name must be detected
		 * by the character count, not only by a missing buffer.
		 */
		if (token == NULL || tokidx == 0) {
			yyerror("missing symbolic name");
			return (T_NULL);
		}

		/*
		 * This restarts the token from the beginning the next
		 * time we scan a character. (This token is complete.)
		 */
		tokidx = 0;

		/*
		 * A few symbols are handled as keywords outside
		 * of the normal categories.
		 */
		if (category == T_END) {
			for (i = 0; symwords[i].name != NULL; i++) {
				if (strcmp(token, symwords[i].name) == 0) {
					last_kw = symwords[i].id;
					return (last_kw);
				}
			}
		}

		/*
		 * Contextual rule: Only literal characters are
		 * permitted in CHARMAP. Anywhere else the symbolic
		 * forms are fine.
		 */
		if ((category != T_CHARMAP) &&
		    (lookup_charmap(token, &yylval.wc)) != -1) {
			return (T_CHAR);
		}
		if ((yylval.collsym = lookup_collsym(token)) != NULL) {
			return (T_COLLSYM);
		}
		if ((yylval.collelem = lookup_collelem(token)) != NULL) {
			return (T_COLLELEM);
		}

		/*
		 * It's an undefined symbol.  Pass the buffer itself to the
		 * parser rather than a copy: the scanner drops its reference
		 * anyway, so copying would leak the buffer.
		 */
		yylval.token = token;
		token = NULL;
		toksz = 0;
		tokidx = 0;
		return (T_SYMBOL);
	}
	yyerror("unterminated symbolic name");
	return (EOF);
}
/*
 * Report the top-level category the scanner is currently inside, as last
 * recorded by consume_token(); T_END when no category is open.
 */
int
get_category(void)
{
	int current = category;

	return (current);
}
/*
 * Classify the token accumulated by add_tok() and return its token id.
 * The token is always considered finished afterwards (tokidx is reset).
 *
 * Classification order:
 *   - "..." yields T_ELLIPSIS without touching last_kw;
 *   - a word from keywords[] yields its id, stored in last_kw; T_END
 *     closes the current category and a category keyword opens one;
 *   - a token starting with a digit, or '-' and a digit, yields T_NUMBER
 *     with the value in yylval.num ("malformed number" is reported when
 *     trailing characters follow the digits);
 *   - a single character yields T_CHAR with that byte in yylval.wc;
 *   - anything else yields T_NAME with the name in yylval.token.  The
 *     scanner's own heap buffer is handed over, so the receiver owns it.
 *
 * Returns T_NULL when no token buffer exists.
 */
static int
consume_token(void)
{
	int len = tokidx;
	int i, j;

	tokidx = 0;
	if (token == NULL)
		return (T_NULL);

	/*
	 * this one is special, because we don't want it to alter the
	 * last_kw field.
	 */
	if (strcmp(token, "...") == 0)
		return (T_ELLIPSIS);

	/* search for reserved words first */
	for (i = 0; keywords[i].name != NULL; i++) {
		if (strcmp(keywords[i].name, token) != 0)
			continue;

		last_kw = keywords[i].id;

		/* clear the top level category if we're done with it */
		if (last_kw == T_END)
			category = T_END;

		/* set the top level category if we're changing */
		for (j = 0; categories[j]; j++) {
			if (categories[j] == last_kw)
				category = last_kw;
		}
		return (keywords[i].id);
	}

	/*
	 * maybe its a numeric constant?  The ctype functions require a
	 * value representable as unsigned char, so plain char (which may
	 * be negative) is converted first.
	 */
	if (isdigit((unsigned char)token[0]) ||
	    (token[0] == '-' && isdigit((unsigned char)token[1]))) {
		char *eptr;

		yylval.num = strtol(token, &eptr, 10);
		if (*eptr != 0)
			yyerror("malformed number");
		return (T_NUMBER);
	}

	/*
	 * A single lone character is treated as a character literal.
	 * To avoid duplication of effort, we stick in the charmap.
	 * Take the byte as unsigned so that the result does not depend
	 * on whether plain char is signed on this platform.
	 */
	if (len == 1) {
		yylval.wc = (unsigned char)token[0];
		return (T_CHAR);
	}

	/*
	 * anything else is treated as a symbolic name.  Pass the buffer
	 * itself to the parser rather than a copy: the scanner drops its
	 * reference anyway, so copying would leak the buffer.
	 */
	yylval.token = token;
	token = NULL;
	toksz = 0;
	tokidx = 0;
	return (T_NAME);
}
/*
 * Discard input up to and including the next newline.  Reaching the end
 * of the input before a newline is reported through errf().
 */
void
scan_to_eol(void)
{
	int c;

	for (;;) {
		c = scanc();
		if (c == '\n')
			break;
		if (c == EOF) {
			/* end of file without newline! */
			errf("missing newline");
			return;
		}
	}
	assert(c == '\n');
}
/*
* The lexer entry point: return the id of the next token, or EOF once
* the input is exhausted.
*
* Two modes are driven by `instring'. Inside a quoted string every
* character is returned on its own as T_CHAR (value in yylval.wc), numeric
* escapes are decoded by get_wide(), '<' starts a symbolic name and the
* closing quote returns T_QUOTE. Outside of strings, characters are
* collected with add_tok() until a delimiter is met, at which point the
* delimiter is pushed back and consume_token() classifies the token.
*
* `hadtok' records whether the current line has produced a token, so that
* blank and comment-only lines do not generate T_NL.
*/
int
yylex(void)
{
int c;
while ((c = scanc()) != EOF) {
/* special handling for quoted string */
if (instring) {
if (escaped) {
escaped = 0;
/* if newline, just eat and forget it */
if (c == '\n')
continue;
/*
* numeric escape: push back the character and the
* escape character so that get_wide() sees the whole
* sequence
*/
if (strchr("xXd01234567", c)) {
unscanc(c);
unscanc(esc_char);
return (get_wide());
}
yylval.wc = get_escaped(c);
return (T_CHAR);
}
if (c == esc_char) {
escaped = 1;
continue;
}
switch (c) {
case '<':
return (get_symbol());
case '>':
/* oops! should generate syntax error */
return (T_GT);
case '"':
instring = 0;
return (T_QUOTE);
default:
yylval.wc = c;
return (T_CHAR);
}
}
/* escaped characters first */
if (escaped) {
escaped = 0;
if (c == '\n') {
/* eat the newline */
continue;
}
hadtok = 1;
if (tokidx) {
/* an escape mid-token is nonsense */
return (T_NULL);
}
/* numeric escapes are treated as wide characters */
if (strchr("xXd01234567", c)) {
unscanc(c);
unscanc(esc_char);
return (get_wide());
}
add_tok(get_escaped(c));
continue;
}
/* if it is the escape character itself note it */
if (c == esc_char) {
escaped = 1;
continue;
}
/* remove from the comment char to end of line */
if (c == com_char) {
while (c != '\n') {
if ((c = scanc()) == EOF) {
/* end of file without newline! */
return (EOF);
}
}
assert(c == '\n');
if (!hadtok) {
/*
* If there were no tokens on this line,
* then just pretend it didn't exist at all.
*/
continue;
}
hadtok = 0;
return (T_NL);
}
if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
/*
* These are all token delimiters. If there
* is a token already in progress, we need to
* process it.
*/
/* the delimiter itself is scanned again on the next call */
unscanc(c);
return (consume_token());
}
switch (c) {
case '\n':
if (!hadtok) {
/*
* If the line was completely devoid of tokens,
* then just ignore it.
*/
continue;
}
/* we're starting a new line, reset the token state */
hadtok = 0;
return (T_NL);
case ',':
hadtok = 1;
return (T_COMMA);
case ';':
hadtok = 1;
return (T_SEMI);
case '(':
hadtok = 1;
return (T_LPAREN);
case ')':
hadtok = 1;
return (T_RPAREN);
case '>':
hadtok = 1;
return (T_GT);
case '<':
/* symbol start! */
hadtok = 1;
return (get_symbol());
case ' ':
case '\t':
/* whitespace, just ignore it */
continue;
case '"':
hadtok = 1;
instring = 1;
return (T_QUOTE);
default:
/* ordinary character: extend the token in progress */
hadtok = 1;
add_tok(c);
continue;
}
}
return (EOF);
}
void
yyerror(const char *msg)
{
(void) fprintf(stderr, "%s: %d: error: %s\n",
filename, lineno, msg);
exit(4);
}
/*
 * Report a fatal error using a printf-style format.  The formatted
 * message is written to stderr, prefixed with the current input name and
 * line number, and the process exits with status 4; this function does
 * not return.
 *
 * If the message cannot be formatted (vasprintf() fails), the raw format
 * string is printed instead, so the buffer pointer is never used while
 * its value is unspecified.
 */
void
errf(const char *fmt, ...)
{
	char *msg;
	va_list va;

	va_start(va, fmt);
	if (vasprintf(&msg, fmt, va) < 0)
		msg = NULL;
	va_end(va);

	(void) fprintf(stderr, "%s: %d: error: %s\n",
	    filename, lineno, msg != NULL ? msg : fmt);
	free(msg);
	exit(4);
}
/*
 * Report a warning using a printf-style format.  The formatted message is
 * written to stderr, prefixed with the current input name and line
 * number, and the `warnings' counter is incremented.  Unless `warnok' is
 * set, the process then exits with status 4.
 *
 * If the message cannot be formatted (vasprintf() fails), the raw format
 * string is printed instead, so the buffer pointer is never used while
 * its value is unspecified.
 */
void
warn(const char *fmt, ...)
{
	char *msg;
	va_list va;

	va_start(va, fmt);
	if (vasprintf(&msg, fmt, va) < 0)
		msg = NULL;
	va_end(va);

	(void) fprintf(stderr, "%s: %d: warning: %s\n",
	    filename, lineno, msg != NULL ? msg : fmt);
	free(msg);
	warnings++;
	if (!warnok)
		exit(4);
}