Add localedef(1), a locale definition generator tool

The localedef tool can read entire (and unmodified) CLDR posix definition
files, and generate all 6 LC categories: LC_COLLATE, LC_CTYPE, LC_TIME,
LC_NUMERIC, LC_MONETARY and LC_MESSAGES.

This tool has a long history with Solaris.  The Nexenta developers
modified it to read CLDR files and created the much richer collation
formats.  The libc collation functions have to be modified to read the
new format (called "BSD-1.0") and to handle the new data structures.

The result will be that locale-sensitive tools and functions will now
properly sort multibyte and unicode strings.

Obtained from:	Dragonfly
This commit is contained in:
Baptiste Daroussin 2015-08-07 23:53:31 +00:00
parent 7be46fbdfd
commit 057ca2d437
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/projects/collation/; revision=286432
16 changed files with 5907 additions and 0 deletions

View File

@ -84,6 +84,7 @@ SUBDIR= ${_addr2line} \
lesskey \
limits \
locale \
localedef \
lock \
lockf \
logger \

View File

@ -0,0 +1,30 @@
# $FreeBSD$
PROG= localedef
SRCS= charmap.c \
collate.c \
ctype.c \
localedef.c \
messages.c \
monetary.c \
numeric.c \
parser.y \
scanner.c \
time.c \
wide.c
WARNS= 3
${SRCS:M*.c}: parser.h
parser.h: parser.y
LIBADD= avl
IGNORE_PRAGMA= yes
CFLAGS+= -DNEED_SOLARIS_BOOLEAN
CFLAGS+= -I. -I${.CURDIR}
CFLAGS+= -I${.CURDIR}/../../lib/libc/locale
CFLAGS+= -I${.CURDIR}/../../lib/libc/stdtime
CFLAGS+= -I${.CURDIR}/../../sys/cddl/compat/opensolaris
CFLAGS+= -I${.CURDIR}/../../sys/cddl/contrib/opensolaris/uts/common
.include <bsd.prog.mk>

11
usr.bin/localedef/README Normal file
View File

@ -0,0 +1,11 @@
While there are tools called "localedef" in Solaris and Linux, this
tool does not share heritage with any other implementation. It was
written independently by Garrett D'Amore while employed at Nexenta
Systems, and thus carries the Nexenta Copyright.
It was initially released under the CDDL license, but on 4 July 2014,
Nexenta reissued the source under the BSD 2-clause license. This
code is part of the Illumos project.
see:
https://github.com/Nexenta/illumos-nexenta/commit/cf17542a37fc83d0ae093777e30d480423858c29

364
usr.bin/localedef/charmap.c Normal file
View File

@ -0,0 +1,364 @@
/*
* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015 John Marino <draco@marino.st>
*
* This source code is derived from the illumos localedef command, and
* provided under BSD-style license terms by Nexenta Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* CHARMAP file handling for localedef.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <sys/avl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <unistd.h>
#include <stddef.h>
#include <unistd.h>
#include "localedef.h"
#include "parser.h"
static avl_tree_t cmap_sym;
static avl_tree_t cmap_wc;
typedef struct charmap {
const char *name;
wchar_t wc;
avl_node_t avl_sym;
avl_node_t avl_wc;
} charmap_t;
/*
* Array of POSIX specific portable characters.
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers"
static const struct {
char *name;
int ch;
} portable_chars[] = {
{ "NUL", '\0' },
{ "alert", '\a' },
{ "backspace", '\b' },
{ "tab", '\t' },
{ "carriage-return", '\r' },
{ "newline", '\n' },
{ "vertical-tab", '\v' },
{ "form-feed", '\f' },
{ "space", ' ' },
{ "exclamation-mark", '!' },
{ "quotation-mark", '"' },
{ "number-sign", '#' },
{ "dollar-sign", '$' },
{ "percent-sign", '%' },
{ "ampersand", '&' },
{ "apostrophe", '\'' },
{ "left-parenthesis", '(' },
{ "right-parenthesis", '(' },
{ "asterisk", '*' },
{ "plus-sign", '+' },
{ "comma", ','},
{ "hyphen-minus", '-' },
{ "hyphen", '-' },
{ "full-stop", '.' },
{ "period", '.' },
{ "slash", '/' },
{ "solidus", '/' },
{ "zero", '0' },
{ "one", '1' },
{ "two", '2' },
{ "three", '3' },
{ "four", '4' },
{ "five", '5' },
{ "six", '6' },
{ "seven", '7' },
{ "eight", '8' },
{ "nine", '9' },
{ "colon", ':' },
{ "semicolon", ';' },
{ "less-than-sign", '<' },
{ "equals-sign", '=' },
{ "greater-than-sign", '>' },
{ "question-mark", '?' },
{ "commercial-at", '@' },
{ "left-square-bracket", '[' },
{ "backslash", '\\' },
{ "reverse-solidus", '\\' },
{ "right-square-bracket", ']' },
{ "circumflex", '^' },
{ "circumflex-accent", '^' },
{ "low-line", '_' },
{ "underscore", '_' },
{ "grave-accent", '`' },
{ "left-brace", '{' },
{ "left-curly-bracket", '{' },
{ "vertical-line", '|' },
{ "right-brace", '}' },
{ "right-curly-bracket", '}' },
{ "tilde", '~' },
{ "A", 'A' },
{ "B", 'B' },
{ "C", 'C' },
{ "D", 'D' },
{ "E", 'E' },
{ "F", 'F' },
{ "G", 'G' },
{ "H", 'H' },
{ "I", 'I' },
{ "J", 'J' },
{ "K", 'K' },
{ "L", 'L' },
{ "M", 'M' },
{ "N", 'N' },
{ "O", 'O' },
{ "P", 'P' },
{ "Q", 'Q' },
{ "R", 'R' },
{ "S", 'S' },
{ "T", 'T' },
{ "U", 'U' },
{ "V", 'V' },
{ "W", 'W' },
{ "X", 'X' },
{ "Y", 'Y' },
{ "Z", 'Z' },
{ "a", 'a' },
{ "b", 'b' },
{ "c", 'c' },
{ "d", 'd' },
{ "e", 'e' },
{ "f", 'f' },
{ "g", 'g' },
{ "h", 'h' },
{ "i", 'i' },
{ "j", 'j' },
{ "k", 'k' },
{ "l", 'l' },
{ "m", 'm' },
{ "n", 'n' },
{ "o", 'o' },
{ "p", 'p' },
{ "q", 'q' },
{ "r", 'r' },
{ "s", 's' },
{ "t", 't' },
{ "u", 'u' },
{ "v", 'v' },
{ "w", 'w' },
{ "x", 'x' },
{ "y", 'y' },
{ "z", 'z' },
{ NULL, 0 }
};
#pragma GCC diagnostic pop
static int
cmap_compare_sym(const void *n1, const void *n2)
{
const charmap_t *c1 = n1;
const charmap_t *c2 = n2;
int rv;
rv = strcmp(c1->name, c2->name);
return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
}
static int
cmap_compare_wc(const void *n1, const void *n2)
{
const charmap_t *c1 = n1;
const charmap_t *c2 = n2;
return ((c1->wc < c2->wc) ? -1 : (c1->wc > c2->wc) ? 1 : 0);
}
void
init_charmap(void)
{
avl_create(&cmap_sym, cmap_compare_sym, sizeof (charmap_t),
offsetof(charmap_t, avl_sym));
avl_create(&cmap_wc, cmap_compare_wc, sizeof (charmap_t),
offsetof(charmap_t, avl_wc));
}
static void
add_charmap_impl(char *sym, wchar_t wc, int nodups)
{
charmap_t srch;
charmap_t *n = NULL;
avl_index_t where;
srch.wc = wc;
srch.name = sym;
/*
* also possibly insert the wide mapping, although note that there
* can only be one of these per wide character code.
*/
if ((wc != -1) && ((avl_find(&cmap_wc, &srch, &where)) == NULL)) {
if ((n = calloc(1, sizeof (*n))) == NULL) {
errf("out of memory");
return;
}
n->wc = wc;
avl_insert(&cmap_wc, n, where);
}
if (sym) {
if (avl_find(&cmap_sym, &srch, &where) != NULL) {
if (nodups) {
errf("duplicate character definition");
}
return;
}
if ((n == NULL) && ((n = calloc(1, sizeof (*n))) == NULL)) {
errf("out of memory");
return;
}
n->wc = wc;
n->name = sym;
avl_insert(&cmap_sym, n, where);
}
}
void
add_charmap(char *sym, int c)
{
add_charmap_impl(sym, c, 1);
}
void
add_charmap_undefined(char *sym)
{
charmap_t srch;
charmap_t *cm = NULL;
srch.name = sym;
cm = avl_find(&cmap_sym, &srch, NULL);
if ((undefok == 0) && ((cm == NULL) || (cm->wc == -1))) {
warn("undefined symbol <%s>", sym);
add_charmap_impl(sym, -1, 0);
} else {
free(sym);
}
}
void
add_charmap_range(char *s, char *e, int wc)
{
int ls, le;
int si;
int sn, en;
int i;
static const char *digits = "0123456789";
ls = strlen(s);
le = strlen(e);
if (((si = strcspn(s, digits)) == 0) || (si == ls) ||
(strncmp(s, e, si) != 0) ||
((int)strspn(s + si, digits) != (ls - si)) ||
((int)strspn(e + si, digits) != (le - si)) ||
((sn = atoi(s + si)) > ((en = atoi(e + si))))) {
errf("malformed charmap range");
return;
}
s[si] = 0;
for (i = sn; i <= en; i++) {
char *nn;
(void) asprintf(&nn, "%s%0*u", s, ls - si, i);
if (nn == NULL) {
errf("out of memory");
return;
}
add_charmap_impl(nn, wc, 1);
wc++;
}
free(s);
free(e);
}
void
add_charmap_char(char *name, int val)
{
add_charmap_impl(name, val, 0);
}
/*
* POSIX insists that certain entries be present, even when not in the
* orginal charmap file.
*/
void
add_charmap_posix(void)
{
int i;
for (i = 0; portable_chars[i].name; i++) {
add_charmap_char(portable_chars[i].name, portable_chars[i].ch);
}
}
int
lookup_charmap(const char *sym, wchar_t *wc)
{
charmap_t srch;
charmap_t *n;
srch.name = sym;
n = avl_find(&cmap_sym, &srch, NULL);
if (n && n->wc != -1) {
if (wc)
*wc = n->wc;
return (0);
}
return (-1);
}
int
check_charmap(wchar_t wc)
{
charmap_t srch;
srch.wc = wc;
return (avl_find(&cmap_wc, &srch, NULL) ? 0 : -1);
}

1299
usr.bin/localedef/collate.c Normal file

File diff suppressed because it is too large Load Diff

464
usr.bin/localedef/ctype.c Normal file
View File

@ -0,0 +1,464 @@
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved.
* Copyright 2015 John Marino <draco@marino.st>
*
* This source code is derived from the illumos localedef command, and
* provided under BSD-style license terms by Nexenta Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* LC_CTYPE database generation routines for localedef.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/avl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <sys/types.h>
#include <wchar.h>
#include <ctype.h>
#include <wctype.h>
#include <unistd.h>
#include "localedef.h"
#include "parser.h"
#include "runefile.h"
#define _ISUPPER _CTYPE_U
#define _ISLOWER _CTYPE_L
#define _ISDIGIT _CTYPE_D
#define _ISXDIGIT _CTYPE_X
#define _ISSPACE _CTYPE_S
#define _ISBLANK _CTYPE_B
#define _ISALPHA _CTYPE_A
#define _ISPUNCT _CTYPE_P
#define _ISGRAPH _CTYPE_G
#define _ISPRINT _CTYPE_R
#define _ISCNTRL _CTYPE_C
#define _E1 _CTYPE_Q
#define _E2 _CTYPE_I
#define _E3 0
#define _E4 0
#define _E5 _CTYPE_T
static avl_tree_t ctypes;
static wchar_t last_ctype;
typedef struct ctype_node {
wchar_t wc;
int32_t ctype;
int32_t toupper;
int32_t tolower;
avl_node_t avl;
} ctype_node_t;
typedef struct width_node {
wchar_t start;
wchar_t end;
int8_t width;
avl_node_t avl;
} width_node_t;
static int
ctype_compare(const void *n1, const void *n2)
{
const ctype_node_t *c1 = n1;
const ctype_node_t *c2 = n2;
return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
}
void
init_ctype(void)
{
avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t),
offsetof(ctype_node_t, avl));
}
static void
add_ctype_impl(ctype_node_t *ctn)
{
switch (last_kw) {
case T_ISUPPER:
ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
break;
case T_ISLOWER:
ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
break;
case T_ISALPHA:
ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
break;
case T_ISDIGIT:
ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT);
break;
case T_ISSPACE:
ctn->ctype |= _ISSPACE;
break;
case T_ISCNTRL:
ctn->ctype |= _ISCNTRL;
break;
case T_ISGRAPH:
ctn->ctype |= (_ISGRAPH | _ISPRINT);
break;
case T_ISPRINT:
ctn->ctype |= _ISPRINT;
break;
case T_ISPUNCT:
ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
break;
case T_ISXDIGIT:
ctn->ctype |= (_ISXDIGIT | _ISPRINT);
break;
case T_ISBLANK:
ctn->ctype |= (_ISBLANK | _ISSPACE);
break;
case T_ISPHONOGRAM:
ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
break;
case T_ISIDEOGRAM:
ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
break;
case T_ISENGLISH:
ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
break;
case T_ISNUMBER:
ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
break;
case T_ISSPECIAL:
ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
break;
case T_ISALNUM:
/*
* We can't do anything with this. The character
* should already be specified as a digit or alpha.
*/
break;
default:
errf("not a valid character class");
}
}
static ctype_node_t *
get_ctype(wchar_t wc)
{
ctype_node_t srch;
ctype_node_t *ctn;
avl_index_t where;
srch.wc = wc;
if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) {
if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
errf("out of memory");
return (NULL);
}
ctn->wc = wc;
avl_insert(&ctypes, ctn, where);
}
return (ctn);
}
void
add_ctype(int val)
{
ctype_node_t *ctn;
if ((ctn = get_ctype(val)) == NULL) {
INTERR;
return;
}
add_ctype_impl(ctn);
last_ctype = ctn->wc;
}
void
add_ctype_range(int end)
{
ctype_node_t *ctn;
wchar_t cur;
if (end < last_ctype) {
errf("malformed character range (%u ... %u))",
last_ctype, end);
return;
}
for (cur = last_ctype + 1; cur <= end; cur++) {
if ((ctn = get_ctype(cur)) == NULL) {
INTERR;
return;
}
add_ctype_impl(ctn);
}
last_ctype = end;
}
/*
* A word about widths: if the width mask is specified, then libc
* unconditionally honors it. Otherwise, it assumes printable
* characters have width 1, and non-printable characters have width
* -1 (except for NULL which is special with with 0). Hence, we have
* no need to inject defaults here -- the "default" unset value of 0
* indicates that libc should use its own logic in wcwidth as described.
*/
void
add_width(int wc, int width)
{
ctype_node_t *ctn;
if ((ctn = get_ctype(wc)) == NULL) {
INTERR;
return;
}
ctn->ctype &= ~(_CTYPE_SWM);
switch (width) {
case 0:
ctn->ctype |= _CTYPE_SW0;
break;
case 1:
ctn->ctype |= _CTYPE_SW1;
break;
case 2:
ctn->ctype |= _CTYPE_SW2;
break;
case 3:
ctn->ctype |= _CTYPE_SW3;
break;
}
}
void
add_width_range(int start, int end, int width)
{
for (; start <= end; start++) {
add_width(start, width);
}
}
void
add_caseconv(int val, int wc)
{
ctype_node_t *ctn;
ctn = get_ctype(val);
if (ctn == NULL) {
INTERR;
return;
}
switch (last_kw) {
case T_TOUPPER:
ctn->toupper = wc;
break;
case T_TOLOWER:
ctn->tolower = wc;
break;
default:
INTERR;
break;
}
}
void
dump_ctype(void)
{
FILE *f;
_FileRuneLocale rl;
ctype_node_t *ctn, *last_ct, *last_lo, *last_up;
_FileRuneEntry *ct = NULL;
_FileRuneEntry *lo = NULL;
_FileRuneEntry *up = NULL;
wchar_t wc;
(void) memset(&rl, 0, sizeof (rl));
last_ct = NULL;
last_lo = NULL;
last_up = NULL;
if ((f = open_category()) == NULL)
return;
(void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
(void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
/*
* Initialize the identity map.
*/
for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
rl.maplower[wc] = wc;
rl.mapupper[wc] = wc;
}
for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
int conflict = 0;
wc = ctn->wc;
/*
* POSIX requires certain portable characters have
* certain types. Add them if they are missing.
*/
if ((wc >= 1) && (wc <= 127)) {
if ((wc >= 'A') && (wc <= 'Z'))
ctn->ctype |= _ISUPPER;
if ((wc >= 'a') && (wc <= 'z'))
ctn->ctype |= _ISLOWER;
if ((wc >= '0') && (wc <= '9'))
ctn->ctype |= _ISDIGIT;
if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
ctn->ctype |= _ISSPACE;
if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
ctn->ctype |= _ISXDIGIT;
if (strchr(" \t", (char)wc))
ctn->ctype |= _ISBLANK;
/*
* Technically these settings are only
* required for the C locale. However, it
* turns out that because of the historical
* version of isprint(), we need them for all
* locales as well. Note that these are not
* necessarily valid punctation characters in
* the current language, but ispunct() needs
* to return TRUE for them.
*/
if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
(char)wc))
ctn->ctype |= _ISPUNCT;
}
/*
* POSIX also requires that certain types imply
* others. Add any inferred types here.
*/
if (ctn->ctype & (_ISUPPER |_ISLOWER))
ctn->ctype |= _ISALPHA;
if (ctn->ctype & _ISDIGIT)
ctn->ctype |= _ISXDIGIT;
if (ctn->ctype & _ISBLANK)
ctn->ctype |= _ISSPACE;
if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
ctn->ctype |= _ISGRAPH;
if (ctn->ctype & _ISGRAPH)
ctn->ctype |= _ISPRINT;
/*
* Finally, POSIX requires that certain combinations
* are invalid. We don't flag this as a fatal error,
* but we will warn about.
*/
if ((ctn->ctype & _ISALPHA) &&
(ctn->ctype & (_ISPUNCT|_ISDIGIT)))
conflict++;
if ((ctn->ctype & _ISPUNCT) &
(ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
conflict++;
if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
conflict++;
if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
conflict++;
if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
conflict++;
if (conflict) {
warn("conflicting classes for character 0x%x (%x)",
wc, ctn->ctype);
}
/*
* Handle the lower 256 characters using the simple
* optimization. Note that if we have not defined the
* upper/lower case, then we identity map it.
*/
if ((unsigned)wc < _CACHED_RUNES) {
rl.runetype[wc] = ctn->ctype;
if (ctn->tolower)
rl.maplower[wc] = ctn->tolower;
if (ctn->toupper)
rl.mapupper[wc] = ctn->toupper;
continue;
}
if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) {
ct[rl.runetype_ext_nranges-1].max = wc;
last_ct = ctn;
} else {
rl.runetype_ext_nranges++;
ct = realloc(ct,
sizeof (*ct) * rl.runetype_ext_nranges);
ct[rl.runetype_ext_nranges - 1].min = wc;
ct[rl.runetype_ext_nranges - 1].max = wc;
ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
last_ct = ctn;
}
if (ctn->tolower == 0) {
last_lo = NULL;
} else if ((last_lo != NULL) &&
(last_lo->tolower + 1 == ctn->tolower)) {
lo[rl.maplower_ext_nranges-1].max = wc;
last_lo = ctn;
} else {
rl.maplower_ext_nranges++;
lo = realloc(lo,
sizeof (*lo) * rl.maplower_ext_nranges);
lo[rl.maplower_ext_nranges - 1].min = wc;
lo[rl.maplower_ext_nranges - 1].max = wc;
lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
last_lo = ctn;
}
if (ctn->toupper == 0) {
last_up = NULL;
} else if ((last_up != NULL) &&
(last_up->toupper + 1 == ctn->toupper)) {
up[rl.mapupper_ext_nranges-1].max = wc;
last_up = ctn;
} else {
rl.mapupper_ext_nranges++;
up = realloc(up,
sizeof (*up) * rl.mapupper_ext_nranges);
up[rl.mapupper_ext_nranges - 1].min = wc;
up[rl.mapupper_ext_nranges - 1].max = wc;
up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
last_up = ctn;
}
}
if ((wr_category(&rl, sizeof (rl), f) < 0) ||
(wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
(wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
(wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
return;
}
close_category(f);
}

View File

@ -0,0 +1,238 @@
.\" Copyright (c) 1992, X/Open Company Limited All Rights Reserved
.\" Portions Copyright (c) 2003, Sun Microsystems, Inc. All Rights Reserved
.\" Portions Copyright 2013 DEY Storage Systems, Inc.
.\" Sun Microsystems, Inc. gratefully acknowledges The Open Group for
.\" permission to reproduce portions of its copyrighted documentation.
.\" Original documentation from The Open Group can be obtained online at
.\" http://www.opengroup.org/bookstore/.
.\" The Institute of Electrical and Electronics Engineers and The Open Group,
.\" have given us permission to reprint portions of their documentation. In
.\" the following statement, the phrase "this text" refers to portions of the
.\" system documentation. Portions of this text are reprinted and reproduced
.\" in electronic form in the Sun OS Reference Manual, from IEEE Std 1003.1,
.\" 2004 Edition, Standard for Information Technology -- Portable Operating
.\" System Interface (POSIX), The Open Group Base Specifications Issue 6,
.\" Copyright (C) 2001-2004 by the Institute of Electrical and Electronics
.\" Engineers, Inc and The Open Group. In the event of any discrepancy between
.\" these versions and the original IEEE and The Open Group Standard, the
.\" original IEEE and The Open Group Standard is the referee document. The
.\" original Standard can be obtained online at
.\" http://www.opengroup.org/unix/online.html.
.\" This notice shall appear on any product containing this material.
.\" The contents of this file are subject to the terms of the Common
.\" Development and Distribution License (the "License"). You may not use
.\" this file except in compliance with the License.
.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or
.\" http://www.opensolaris.org/os/licensing. See the License for the specific
.\" language governing permissions and limitations under the License.
.\" When distributing Covered Code, include this CDDL HEADER in each file and
.\" include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable,
.\" add the following below this CDDL HEADER, with the fields enclosed by
.\" brackets "[]" replaced with your own identifying information:
.\" Portions Copyright [yyyy] [name of copyright owner]
.Dd July 28, 2015
.Dt LOCALEDEF 1
.Os
.Sh NAME
.Nm localedef
.Nd define locale environment
.Sh SYNOPSIS
.Nm
.Op Fl D
.Op Fl c
.Op Fl v
.Op Fl U
.Op Fl f Ar charmap
.Op Fl w Ar widthfile
.Op Fl i Ar sourcefile
.Op Fl u Ar codeset
localename
.Sh DESCRIPTION
The
.Nm localedef
utility converts source definitions for locale categories
into a format usable by the functions and utilities whose operational behavior
is determined by the setting of the locale environment variables; see
.Xr environ 5 .
.Pp
The utility reads source definitions for one or more locale categories
belonging to the same locale from the file named in the \fB-i\fR option (if
specified) or from standard input.
.Pp
Each category source definition is identified by the corresponding environment
variable name and terminated by an
.Sy END
.Em category-name
statement. The following categories are supported:
.Bl -tag -width LC_MONETARY
.It LC_CTYPE
Defines character classification and case conversion.
.It LC_COLLATE
Defines collation rules.
.It LC_MONETARY
Defines the format and symbols used in formatting of monetary information.
.It LC_NUMERIC
Defines the decimal delimiter, grouping and grouping symbol for non-monetary
numeric editing.
.It LC_TIME
Defines the format and content of date and time information.
.It LC_MESSAGES
Defines the format and values of affirmative and negative responses.
.El
.Pp
The following options are supported:
.Bl -tag -width xx_sourcefile
.It -D
BSD-style output. Rather than the default of creating the
.Sy localename
directory and creating files like LC_CTYPE, LC_COLLATE, etc, in that directory,
the output files have the format "<localename>.<category>" and are
dumped to the current directory.
.It -c
Creates permanent output even if warning messages have been issued.
.It -v
Emit verbose debugging output on standard output.
.It -U
Ignore the presence of character symbols that have no matching character
definition. This facilitates the use of a common locale definition file
to be used across multiple encodings, even when some symbols are not
present in a given encoding.
.It -f charmap
Specifies the pathname of a file containing a mapping of character symbols and
collating element symbols to actual character encodings. This option must be
specified if symbolic names (other than collating symbols defined in a
.Sy collating-symbol
keyword) are used. If the
.Sy -f
option is not present, the default character mapping will be used.
.It -w widthfile
The path name of the file containing character screen width definitions.
If not supplied, then default screen widths will be assumed, which will
generally not account for East Asian encodings requiring more than a single
character cell to display, nor for combining or accent marks that occupy
no additional screen width.
.It -i sourcefile
The path name of a file containing the source definitions. If this option is
not present, source definitions will be read from standard input.
.It -u codeset
Specifies the name of a codeset used as the target mapping of character symbols
and collating element symbols whose encoding values are defined in terms of the
ISO/IEC 10646-1: 2000 standard position constant values. See NOTES.
.El
.Pp
The following operands are required:
.Bl -tag -width localename
.It localename
Identifies the locale. If the name contains one or more slash characters,
.Ar localename
will be interpreted as a path name where the created locale
definitions will be stored. This capability may be restricted to users with
appropriate privileges. (As a consequence of specifying one
.Ar localename ,
although several categories can be processed in one execution, only categories
belonging to the same locale can be processed.)
.El
.Sh OUTPUT
.Nm
creates a directory of files that represents the locale's data, unless instructed
otherwise by the
.Sy -D
(BSD output) option. The contants of this directory should generally be
copied into the appropriate subdirectory of /usr/share/locale in order the
definitions to be visible to programs linked with libc.
.Sh ENVIRONMENT
See
.Xr Benviron 5
for definitions of the following environment variables that affect the execution of
.Nm :
.Sy LANG ,
.Sy LC_ALL ,
.Sy LC_COLLATE ,
.Sy LC_CTYPE ,
.Sy LC_MESSAGES ,
.Sy LC_MONETARY ,
.Sy LC_MUMERIC ,
.Sy LC_TIME ,
and
.Sy NLSPATH .
.Sh EXIT STATUS
The following exit values are returned:
.Bl -tag -width XX
.It 0
No errors occurred and the locales were successfully created.
.It 1
Warnings occurred and the locales were successfully created.
.It 2
The locale specification exceeded implementation limits or the coded character
set or sets used were not supported by the implementation, and no locale was
created.
.It >3
Warnings or errors occurred and no output was created.
.El
.Pp
If an error is detected, no permanent output will be created.
.Sh SEE ALSO
.Xr locale 1 ,
.Xr iconv_open 3 ,
.Xr nl_langinfo 3 ,
.Xr strftime 3 ,
.Xr environ 5 .
.Sh WARNINGS
If warnings occur, permanent output will be created if the
.Sy -c
option was specified. The following conditions will cause warning messages to be issued:
.Bl -tag -width X
.It *
If a symbolic name not found in the
.Em charmap
file is used for the descriptions of the
.Sy LC_CTYPE
or
.Sy LC_COLLATE
categories (for other categories, this will be an error condition).
.It *
If optional keywords not supported by the implementation are present in the
source.
.El
.Sh NOTES
When the
.Sy -u
option is used, the
.Em codeset
option-argument is interpreted as a name of a codeset to which the
ISO/IEC 10646-1: 2000 standard position constant values are converted. Both the
ISO/IEC 10646-1: 2000 standard position constant values and other formats (decimal,
hexadecimal, or octal) are valid as encoding values within the charmap file. The
codeset can be any codeset that is supported by the \fBiconv_open\fR(3C) function
on the system.
.Pp
When conflicts occur between the charmap specification of
.Em codeset ,
.Em mb_cur_max ,
or
.Em mb_cur_min
and the corresponding value for the codeset represented by the
.Sy -u
option-argument
.Em codeset ,
the
.Nm
utility fails as an error.
.Pp
When conflicts occur between the charmap encoding values specified for symbolic
names of characters of the portable character set and the character encoding
values defined by the US-ASCII, the result is unspecified.
.Sh HISTORY
.Nm
first appeared in
.Dx
4.4. It was ported from Illumos from the point
.An Garrett D'Amore
.Aq garrett@nexenta.com
added multibyte support (October 2010).
.An John Marino
.Aq draco@marino.st
provided the alternations necessary to compile cleanly on
.Dx
as well as altered libc to use the new collation (the changes were also based
on Illumos, but modified to work with xlocale functionality.)

View File

@ -0,0 +1,346 @@
/*
* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015 John Marino <draco@marino.st>
*
* This source code is derived from the illumos localedef command, and
* provided under BSD-style license terms by Nexenta Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* POSIX localedef.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>
#include <unistd.h>
#include <libgen.h>
#include <stddef.h>
#include <unistd.h>
#include <limits.h>
#include <locale.h>
#include <dirent.h>
#include "localedef.h"
#include "parser.h"
#ifndef TEXT_DOMAIN
#define TEXT_DOMAIN "SYS_TEST"
#endif
int bsd = 0;
int verbose = 0;
int undefok = 0;
int warnok = 0;
static char *locname = NULL;
static char locpath[PATH_MAX];
const char *
category_name(void)
{
switch (get_category()) {
case T_CHARMAP:
return ("CHARMAP");
case T_WIDTH:
return ("WIDTH");
case T_COLLATE:
return ("LC_COLLATE");
case T_CTYPE:
return ("LC_CTYPE");
case T_MESSAGES:
return ("LC_MESSAGES");
case T_MONETARY:
return ("LC_MONETARY");
case T_NUMERIC:
return ("LC_NUMERIC");
case T_TIME:
return ("LC_TIME");
default:
INTERR;
return (NULL);
}
}
static char *
category_file(void)
{
if (bsd)
(void) snprintf(locpath, sizeof (locpath), "%s.%s",
locname, category_name());
else
(void) snprintf(locpath, sizeof (locpath), "%s/%s",
locname, category_name());
return (locpath);
}
FILE *
open_category(void)
{
FILE *file;
if (verbose) {
(void) printf("Writing category %s: ", category_name());
(void) fflush(stdout);
}
/* make the parent directory */
if (!bsd)
(void) mkdir(dirname(category_file()), 0755);
/*
* note that we have to regenerate the file name, as dirname
* clobbered it.
*/
file = fopen(category_file(), "w");
if (file == NULL) {
errf(strerror(errno));
return (NULL);
}
return (file);
}
void
close_category(FILE *f)
{
if (fchmod(fileno(f), 0644) < 0) {
(void) fclose(f);
(void) unlink(category_file());
errf(strerror(errno));
}
if (fclose(f) < 0) {
(void) unlink(category_file());
errf(strerror(errno));
}
if (verbose) {
(void) fprintf(stdout, "done.\n");
(void) fflush(stdout);
}
}
/*
* This function is used when copying the category from another
* locale. Note that the copy is actually performed using a hard
* link for efficiency.
*/
void
copy_category(char *src)
{
char srcpath[PATH_MAX];
int rv;
(void) snprintf(srcpath, sizeof (srcpath), "%s/%s",
src, category_name());
rv = access(srcpath, R_OK);
if ((rv != 0) && (strchr(srcpath, '/') == NULL)) {
/* Maybe we should try the system locale */
(void) snprintf(srcpath, sizeof (srcpath),
"/usr/lib/locale/%s/%s", src, category_name());
rv = access(srcpath, R_OK);
}
if (rv != 0) {
fprintf(stderr,"source locale data unavailable: %s", src);
return;
}
if (verbose > 1) {
(void) printf("Copying category %s from %s: ",
category_name(), src);
(void) fflush(stdout);
}
/* make the parent directory */
if (!bsd)
(void) mkdir(dirname(category_file()), 0755);
if (link(srcpath, category_file()) != 0) {
fprintf(stderr,"unable to copy locale data: %s",
strerror(errno));
return;
}
if (verbose > 1) {
(void) printf("done.\n");
}
}
int
putl_category(const char *s, FILE *f)
{
if (s && fputs(s, f) == EOF) {
(void) fclose(f);
(void) unlink(category_file());
errf(strerror(errno));
return (EOF);
}
if (fputc('\n', f) == EOF) {
(void) fclose(f);
(void) unlink(category_file());
errf(strerror(errno));
return (EOF);
}
return (0);
}
int
wr_category(void *buf, size_t sz, FILE *f)
{
if (!sz) {
return (0);
}
if (fwrite(buf, sz, 1, f) < 1) {
(void) fclose(f);
(void) unlink(category_file());
errf(strerror(errno));
return (EOF);
}
return (0);
}
int yyparse(void);
static void
usage(void)
{
(void) fprintf(stderr, "Usage: localedef [options] localename\n");
(void) fprintf(stderr, "[options] are:\n");
(void) fprintf(stderr, " -D : BSD-style output\n");
(void) fprintf(stderr, " -c : ignore warnings\n");
(void) fprintf(stderr, " -v : verbose output\n");
(void) fprintf(stderr, " -U : ignore undefined symbols\n");
(void) fprintf(stderr, " -f charmap : use given charmap file\n");
(void) fprintf(stderr, " -u encoding : assume encoding\n");
(void) fprintf(stderr, " -w widths : use screen widths file\n");
(void) fprintf(stderr, " -i locsrc : source file for locale\n");
exit(4);
}
int
main(int argc, char **argv)
{
int c;
char *lfname = NULL;
char *cfname = NULL;
char *wfname = NULL;
DIR *dir;
init_charmap();
init_collate();
init_ctype();
init_messages();
init_monetary();
init_numeric();
init_time();
yydebug = 0;
(void) setlocale(LC_ALL, "");
while ((c = getopt(argc, argv, "w:i:cf:u:vUD")) != -1) {
switch (c) {
case 'D':
bsd = 1;
break;
case 'v':
verbose++;
break;
case 'i':
lfname = optarg;
break;
case 'u':
set_wide_encoding(optarg);
break;
case 'f':
cfname = optarg;
break;
case 'U':
undefok++;
break;
case 'c':
warnok++;
break;
case 'w':
wfname = optarg;
break;
case '?':
usage();
break;
}
}
if ((argc - 1) != (optind)) {
usage();
}
locname = argv[argc - 1];
if (verbose) {
(void) printf("Processing locale %s.\n", locname);
}
if (cfname) {
if (verbose)
(void) printf("Loading charmap %s.\n", cfname);
reset_scanner(cfname);
(void) yyparse();
}
if (wfname) {
if (verbose)
(void) printf("Loading widths %s.\n", wfname);
reset_scanner(wfname);
(void) yyparse();
}
if (verbose) {
(void) printf("Loading POSIX portable characters.\n");
}
add_charmap_posix();
if (lfname) {
reset_scanner(lfname);
} else {
reset_scanner(NULL);
}
/* make the directory for the locale if not already present */
if (!bsd) {
while ((dir = opendir(locname)) == NULL) {
if ((errno != ENOENT) ||
(mkdir(locname, 0755) < 0)) {
errf(strerror(errno));
}
}
(void) closedir(dir);
(void) mkdir(dirname(category_file()), 0755);
}
(void) yyparse();
if (verbose) {
(void) printf("All done.\n");
}
return (warnings ? 1 : 0);
}

View File

@ -0,0 +1,172 @@
/*
* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015 John Marino <draco@marino.st>
*
* This source code is derived from the illumos localedef command, and
* provided under BSD-style license terms by Nexenta Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* $FreeBSD$
*/
/*
* POSIX localedef.
*/
/* Common header files. */
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <sys/types.h>
extern int com_char;
extern int esc_char;
extern int mb_cur_max;
extern int mb_cur_min;
extern int last_kw;
extern int verbose;
extern int yydebug;
extern int lineno;
extern int undefok; /* mostly ignore undefined symbols */
extern int warnok;
extern int warnings;
int yylex(void);
void yyerror(const char *);
void errf(const char *, ...);
void warn(const char *, ...);
int putl_category(const char *, FILE *);
int wr_category(void *, size_t, FILE *);
FILE *open_category(void);
void close_category(FILE *);
void copy_category(char *);
const char *category_name(void);
int get_category(void);
int get_symbol(void);
int get_escaped(int);
int get_wide(void);
void reset_scanner(const char *);
void scan_to_eol(void);
void add_wcs(wchar_t);
void add_tok(int);
wchar_t *get_wcs(void);
/* charmap.c - CHARMAP handling */
void init_charmap(void);
void add_charmap(char *, int);
void add_charmap_undefined(char *);
void add_charmap_posix(void);
void add_charmap_range(char *, char *, int);
void add_charmap_char(char *name, int val);
int lookup_charmap(const char *, wchar_t *);
int check_charmap_undefined(char *);
int check_charmap(wchar_t);
/* collate.o - LC_COLLATE handling */
typedef struct collelem collelem_t;
typedef struct collsym collsym_t;
void init_collate(void);
void define_collsym(char *);
void define_collelem(char *, wchar_t *);
void add_order_directive(void);
void add_order_bit(int);
void dump_collate(void);
collsym_t *lookup_collsym(char *);
collelem_t *lookup_collelem(char *);
void start_order_collelem(collelem_t *);
void start_order_undefined(void);
void start_order_symbol(char *);
void start_order_char(wchar_t);
void start_order_ellipsis(void);
void end_order_collsym(collsym_t *);
void end_order(void);
void add_weight(int32_t, int);
void add_weights(int32_t *);
void add_weight_num(int);
void add_order_collelem(collelem_t *);
void add_order_collsym(collsym_t *);
void add_order_char(wchar_t);
void add_order_ignore(void);
void add_order_ellipsis(void);
void add_order_symbol(char *);
void add_order_subst(void);
void add_subst_char(wchar_t);
void add_subst_collsym(collsym_t *);
void add_subst_collelem(collelem_t *);
void add_subst_symbol(char *);
int32_t get_weight(int32_t, int);
wchar_t * wsncpy(wchar_t *, const wchar_t *, size_t);
/* ctype.c - LC_CTYPE handling */
void init_ctype(void);
void add_ctype(int);
void add_ctype_range(int);
void add_width(int, int);
void add_width_range(int, int, int);
void add_caseconv(int, int);
void dump_ctype(void);
/* messages.c - LC_MESSAGES handling */
void init_messages(void);
void add_message(wchar_t *);
void dump_messages(void);
/* monetary.c - LC_MONETARY handling */
void init_monetary(void);
void add_monetary_str(wchar_t *);
void add_monetary_num(int);
void reset_monetary_group(void);
void add_monetary_group(int);
void dump_monetary(void);
/* numeric.c - LC_NUMERIC handling */
void init_numeric(void);
void add_numeric_str(wchar_t *);
void reset_numeric_group(void);
void add_numeric_group(int);
void dump_numeric(void);
/* time.c - LC_TIME handling */
void init_time(void);
void add_time_str(wchar_t *);
void reset_time_list(void);
void add_time_list(wchar_t *);
void check_time_list(void);
void dump_time(void);
/* wide.c - Wide character handling. */
int to_wide(wchar_t *, const char *);
int to_mbs(char *, wchar_t);
int to_mb(char *, wchar_t);
char *to_mb_string(const wchar_t *);
void set_wide_encoding(const char *);
void werr(const char *, ...);
const char *get_wide_encoding(void);
int max_wide(void);
//#define _(x) gettext(x)
#define INTERR fprintf(stderr,"internal fault (%s:%d)", __FILE__, __LINE__)

View File

@ -0,0 +1,121 @@
/*
* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015 John Marino <draco@marino.st>
*
* This source code is derived from the illumos localedef command, and
* provided under BSD-style license terms by Nexenta Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* LC_MESSAGES database generation routines for localedef.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/types.h>
#include <string.h>
#include <unistd.h>
#include "localedef.h"
#include "parser.h"
#include "lmessages.h"
static struct lc_messages_T msgs;
void
init_messages(void)
{
(void) memset(&msgs, 0, sizeof (msgs));
}
void
add_message(wchar_t *wcs)
{
char *str;
if ((str = to_mb_string(wcs)) == NULL) {
INTERR;
return;
}
free(wcs);
switch (last_kw) {
case T_YESSTR:
msgs.yesstr = str;
break;
case T_NOSTR:
msgs.nostr = str;
break;
case T_YESEXPR:
msgs.yesexpr = str;
break;
case T_NOEXPR:
msgs.noexpr = str;
break;
default:
free(str);
INTERR;
break;
}
}
void
dump_messages(void)
{
FILE *f;
char *ptr;
if (msgs.yesstr == NULL) {
warn("missing field 'yesstr'");
msgs.yesstr = "";
}
if (msgs.nostr == NULL) {
warn("missing field 'nostr'");
msgs.nostr = "";
}
/*
* CLDR likes to add : separated lists for yesstr and nostr.
* Legacy Solaris code does not seem to grok this. Fix it.
*/
if ((ptr = strchr(msgs.yesstr, ':')) != NULL)
*ptr = 0;
if ((ptr = strchr(msgs.nostr, ':')) != NULL)
*ptr = 0;
if ((f = open_category()) == NULL) {
return;
}
if ((putl_category(msgs.yesexpr, f) == EOF) ||
(putl_category(msgs.noexpr, f) == EOF) ||
(putl_category(msgs.yesstr, f) == EOF) ||
(putl_category(msgs.nostr, f) == EOF)) {
return;
}
close_category(f);
}

View File

@ -0,0 +1,216 @@
/*
* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015 John Marino <draco@marino.st>
*
* This source code is derived from the illumos localedef command, and
* provided under BSD-style license terms by Nexenta Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* LC_MONETARY database generation routines for localedef.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/types.h>
#include <string.h>
#include <unistd.h>
#include "localedef.h"
#include "parser.h"
#include "lmonetary.h"
static struct lc_monetary_T mon;
void
init_monetary(void)
{
(void) memset(&mon, 0, sizeof (mon));
}
void
add_monetary_str(wchar_t *wcs)
{
char *str;
if ((str = to_mb_string(wcs)) == NULL) {
INTERR;
return;
}
free(wcs);
switch (last_kw) {
case T_INT_CURR_SYMBOL:
mon.int_curr_symbol = str;
break;
case T_CURRENCY_SYMBOL:
mon.currency_symbol = str;
break;
case T_MON_DECIMAL_POINT:
mon.mon_decimal_point = str;
break;
case T_MON_THOUSANDS_SEP:
mon.mon_thousands_sep = str;
break;
case T_POSITIVE_SIGN:
mon.positive_sign = str;
break;
case T_NEGATIVE_SIGN:
mon.negative_sign = str;
break;
default:
free(str);
INTERR;
break;
}
}
void
add_monetary_num(int n)
{
char *str = NULL;
(void) asprintf(&str, "%d", n);
if (str == NULL) {
fprintf(stderr, "out of memory");
return;
}
switch (last_kw) {
case T_INT_FRAC_DIGITS:
mon.int_frac_digits = str;
break;
case T_FRAC_DIGITS:
mon.frac_digits = str;
break;
case T_P_CS_PRECEDES:
mon.p_cs_precedes = str;
break;
case T_P_SEP_BY_SPACE:
mon.p_sep_by_space = str;
break;
case T_N_CS_PRECEDES:
mon.n_cs_precedes = str;
break;
case T_N_SEP_BY_SPACE:
mon.n_sep_by_space = str;
break;
case T_P_SIGN_POSN:
mon.p_sign_posn = str;
break;
case T_N_SIGN_POSN:
mon.n_sign_posn = str;
break;
case T_INT_P_CS_PRECEDES:
mon.int_p_cs_precedes = str;
break;
case T_INT_N_CS_PRECEDES:
mon.int_n_cs_precedes = str;
break;
case T_INT_P_SEP_BY_SPACE:
mon.int_p_sep_by_space = str;
break;
case T_INT_N_SEP_BY_SPACE:
mon.int_n_sep_by_space = str;
break;
case T_INT_P_SIGN_POSN:
mon.int_p_sign_posn = str;
break;
case T_INT_N_SIGN_POSN:
mon.int_n_sign_posn = str;
break;
case T_MON_GROUPING:
mon.mon_grouping = str;
break;
default:
INTERR;
break;
}
}
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wcast-qual"
void
reset_monetary_group(void)
{
free((char *)mon.mon_grouping);
mon.mon_grouping = NULL;
}
void
add_monetary_group(int n)
{
char *s = NULL;
if (mon.mon_grouping == NULL) {
(void) asprintf(&s, "%d", n);
} else {
(void) asprintf(&s, "%s;%d", mon.mon_grouping, n);
}
if (s == NULL)
fprintf(stderr, "out of memory");
free((char *)mon.mon_grouping);
mon.mon_grouping = s;
}
#pragma GCC diagnostic pop
void
dump_monetary(void)
{
FILE *f;
if ((f = open_category()) == NULL) {
return;
}
if ((putl_category(mon.int_curr_symbol, f) == EOF) ||
(putl_category(mon.currency_symbol, f) == EOF) ||
(putl_category(mon.mon_decimal_point, f) == EOF) ||
(putl_category(mon.mon_thousands_sep, f) == EOF) ||
(putl_category(mon.mon_grouping, f) == EOF) ||
(putl_category(mon.positive_sign, f) == EOF) ||
(putl_category(mon.negative_sign, f) == EOF) ||
(putl_category(mon.int_frac_digits, f) == EOF) ||
(putl_category(mon.frac_digits, f) == EOF) ||
(putl_category(mon.p_cs_precedes, f) == EOF) ||
(putl_category(mon.p_sep_by_space, f) == EOF) ||
(putl_category(mon.n_cs_precedes, f) == EOF) ||
(putl_category(mon.n_sep_by_space, f) == EOF) ||
(putl_category(mon.p_sign_posn, f) == EOF) ||
(putl_category(mon.n_sign_posn, f) == EOF) ||
(putl_category(mon.int_p_cs_precedes, f) == EOF) ||
(putl_category(mon.int_n_cs_precedes, f) == EOF) ||
(putl_category(mon.int_p_sep_by_space, f) == EOF) ||
(putl_category(mon.int_n_sep_by_space, f) == EOF) ||
(putl_category(mon.int_p_sign_posn, f) == EOF) ||
(putl_category(mon.int_n_sign_posn, f) == EOF)) {
return;
}
close_category(f);
}

124
usr.bin/localedef/numeric.c Normal file
View File

@ -0,0 +1,124 @@
/*
* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015 John Marino <draco@marino.st>
*
* This source code is derived from the illumos localedef command, and
* provided under BSD-style license terms by Nexenta Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* LC_NUMERIC database generation routines for localedef.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/types.h>
#include <string.h>
#include <unistd.h>
#include "localedef.h"
#include "parser.h"
#include "lnumeric.h"
static struct lc_numeric_T numeric;
void
init_numeric(void)
{
(void) memset(&numeric, 0, sizeof (numeric));
}
void
add_numeric_str(wchar_t *wcs)
{
char *str;
if ((str = to_mb_string(wcs)) == NULL) {
INTERR;
return;
}
free(wcs);
switch (last_kw) {
case T_DECIMAL_POINT:
numeric.decimal_point = str;
break;
case T_THOUSANDS_SEP:
numeric.thousands_sep = str;
break;
default:
free(str);
INTERR;
break;
}
}
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wcast-qual"
void
reset_numeric_group(void)
{
free((char *)numeric.grouping);
numeric.grouping = NULL;
}
void
add_numeric_group(int n)
{
char *s;
if (numeric.grouping == NULL) {
(void) asprintf(&s, "%d", n);
} else {
(void) asprintf(&s, "%s;%d", numeric.grouping, n);
}
if (s == NULL)
fprintf(stderr, "out of memory");
free((char *)numeric.grouping);
numeric.grouping = s;
}
#pragma GCC diagnostic pop
void
dump_numeric(void)
{
FILE *f;
if ((f = open_category()) == NULL) {
return;
}
if ((putl_category(numeric.decimal_point, f) == EOF) ||
(putl_category(numeric.thousands_sep, f) == EOF) ||
(putl_category(numeric.grouping, f) == EOF)) {
return;
}
close_category(f);
}

706
usr.bin/localedef/parser.y Normal file
View File

@ -0,0 +1,706 @@
%{
/*
* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015 John Marino <draco@marino.st>
*
* This source code is derived from the illumos localedef command, and
* provided under BSD-style license terms by Nexenta Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* POSIX localedef grammar.
*/
#include <wchar.h>
#include <stdio.h>
#include <limits.h>
#include "localedef.h"
%}
%union {
int num;
wchar_t wc;
char *token;
collsym_t *collsym;
collelem_t *collelem;
}
%token T_CODE_SET
%token T_MB_CUR_MAX
%token T_MB_CUR_MIN
%token T_COM_CHAR
%token T_ESC_CHAR
%token T_LT
%token T_GT
%token T_NL
%token T_SEMI
%token T_COMMA
%token T_ELLIPSIS
%token T_RPAREN
%token T_LPAREN
%token T_QUOTE
%token T_NULL
%token T_WS
%token T_END
%token T_COPY
%token T_CHARMAP
%token T_WIDTH
%token T_CTYPE
%token T_ISUPPER
%token T_ISLOWER
%token T_ISALPHA
%token T_ISDIGIT
%token T_ISPUNCT
%token T_ISXDIGIT
%token T_ISSPACE
%token T_ISPRINT
%token T_ISGRAPH
%token T_ISBLANK
%token T_ISCNTRL
%token T_ISALNUM
%token T_ISSPECIAL
%token T_ISPHONOGRAM
%token T_ISIDEOGRAM
%token T_ISENGLISH
%token T_ISNUMBER
%token T_TOUPPER
%token T_TOLOWER
%token T_COLLATE
%token T_COLLATING_SYMBOL
%token T_COLLATING_ELEMENT
%token T_ORDER_START
%token T_ORDER_END
%token T_FORWARD
%token T_BACKWARD
%token T_POSITION
%token T_FROM
%token T_UNDEFINED
%token T_IGNORE
%token T_MESSAGES
%token T_YESSTR
%token T_NOSTR
%token T_YESEXPR
%token T_NOEXPR
%token T_MONETARY
%token T_INT_CURR_SYMBOL
%token T_CURRENCY_SYMBOL
%token T_MON_DECIMAL_POINT
%token T_MON_THOUSANDS_SEP
%token T_POSITIVE_SIGN
%token T_NEGATIVE_SIGN
%token T_MON_GROUPING
%token T_INT_FRAC_DIGITS
%token T_FRAC_DIGITS
%token T_P_CS_PRECEDES
%token T_P_SEP_BY_SPACE
%token T_N_CS_PRECEDES
%token T_N_SEP_BY_SPACE
%token T_P_SIGN_POSN
%token T_N_SIGN_POSN
%token T_INT_P_CS_PRECEDES
%token T_INT_N_CS_PRECEDES
%token T_INT_P_SEP_BY_SPACE
%token T_INT_N_SEP_BY_SPACE
%token T_INT_P_SIGN_POSN
%token T_INT_N_SIGN_POSN
%token T_NUMERIC
%token T_DECIMAL_POINT
%token T_THOUSANDS_SEP
%token T_GROUPING
%token T_TIME
%token T_ABDAY
%token T_DAY
%token T_ABMON
%token T_MON
%token T_ERA
%token T_ERA_D_FMT
%token T_ERA_T_FMT
%token T_ERA_D_T_FMT
%token T_ALT_DIGITS
%token T_D_T_FMT
%token T_D_FMT
%token T_T_FMT
%token T_AM_PM
%token T_T_FMT_AMPM
%token T_DATE_FMT
%token <wc> T_CHAR
%token <token> T_NAME
%token <num> T_NUMBER
%token <token> T_SYMBOL
%token <collsym> T_COLLSYM
%token <collelem> T_COLLELEM
%%
localedef : setting_list categories
| categories
;
string : T_QUOTE charlist T_QUOTE
| T_QUOTE T_QUOTE
;
charlist : charlist T_CHAR
{
add_wcs($2);
}
| T_CHAR
{
add_wcs($1);
}
;
setting_list : setting_list setting
| setting
;
setting : T_COM_CHAR T_CHAR T_NL
{
com_char = $2;
}
| T_ESC_CHAR T_CHAR T_NL
{
esc_char = $2;
}
| T_MB_CUR_MAX T_NUMBER T_NL
{
mb_cur_max = $2;
}
| T_MB_CUR_MIN T_NUMBER T_NL
{
mb_cur_min = $2;
}
| T_CODE_SET string T_NL
{
wchar_t *w = get_wcs();
set_wide_encoding(to_mb_string(w));
free(w);
}
| T_CODE_SET T_NAME T_NL
{
set_wide_encoding($2);
}
;
copycat : T_COPY T_NAME T_NL
{
copy_category($2);
}
| T_COPY string T_NL
{
wchar_t *w = get_wcs();
copy_category(to_mb_string(w));
free(w);
}
;
categories : categories category
| category
;
category : charmap
| messages
| monetary
| ctype
| collate
| numeric
| time
;
charmap : T_CHARMAP T_NL charmap_list T_END T_CHARMAP T_NL
| T_WIDTH T_NL width_list T_END T_WIDTH T_NL
;
charmap_list : charmap_list charmap_entry
| charmap_entry
;
charmap_entry : T_SYMBOL T_CHAR
{
add_charmap($1, $2);
scan_to_eol();
}
| T_SYMBOL T_ELLIPSIS T_SYMBOL T_CHAR
{
add_charmap_range($1, $3, $4);
scan_to_eol();
}
| T_NL
;
width_list : width_list width_entry
| width_entry
;
width_entry : T_CHAR T_NUMBER T_NL
{
add_width($1, $2);
}
| T_SYMBOL T_NUMBER T_NL
{
add_charmap_undefined($1);
}
| T_CHAR T_ELLIPSIS T_CHAR T_NUMBER T_NL
{
add_width_range($1, $3, $4);
}
| T_SYMBOL T_ELLIPSIS T_SYMBOL T_NUMBER T_NL
{
add_charmap_undefined($1);
add_charmap_undefined($3);
}
| T_CHAR T_ELLIPSIS T_SYMBOL T_NUMBER T_NL
{
add_width($1, $4);
add_charmap_undefined($3);
}
| T_SYMBOL T_ELLIPSIS T_CHAR T_NUMBER T_NL
{
add_width($3, $4);
add_charmap_undefined($1);
}
| T_NL
;
ctype : T_CTYPE T_NL ctype_list T_END T_CTYPE T_NL
{
dump_ctype();
}
| T_CTYPE T_NL copycat T_END T_CTYPE T_NL
;
ctype_list : ctype_list ctype_kw
| ctype_kw
;
ctype_kw : T_ISUPPER cc_list T_NL
| T_ISLOWER cc_list T_NL
| T_ISALPHA cc_list T_NL
| T_ISDIGIT cc_list T_NL
| T_ISPUNCT cc_list T_NL
| T_ISXDIGIT cc_list T_NL
| T_ISSPACE cc_list T_NL
| T_ISPRINT cc_list T_NL
| T_ISGRAPH cc_list T_NL
| T_ISBLANK cc_list T_NL
| T_ISCNTRL cc_list T_NL
| T_ISALNUM cc_list T_NL
| T_ISSPECIAL cc_list T_NL
| T_ISENGLISH cc_list T_NL
| T_ISNUMBER cc_list T_NL
| T_ISIDEOGRAM cc_list T_NL
| T_ISPHONOGRAM cc_list T_NL
| T_TOUPPER conv_list T_NL
| T_TOLOWER conv_list T_NL
;
cc_list : cc_list T_SEMI T_CHAR
{
add_ctype($3);
}
| cc_list T_SEMI T_SYMBOL
{
add_charmap_undefined($3);
}
| cc_list T_SEMI T_ELLIPSIS T_SEMI T_CHAR
{
/* note that the endpoints *must* be characters */
add_ctype_range($5);
}
| T_CHAR
{
add_ctype($1);
}
| T_SYMBOL
{
add_charmap_undefined($1);
}
;
conv_list : conv_list T_SEMI conv_pair
| conv_pair
;
conv_pair : T_LPAREN T_CHAR T_COMMA T_CHAR T_RPAREN
{
add_caseconv($2, $4);
}
| T_LPAREN T_SYMBOL T_COMMA T_CHAR T_RPAREN
{
add_charmap_undefined($2);
}
| T_LPAREN T_SYMBOL T_COMMA T_SYMBOL T_RPAREN
{
add_charmap_undefined($2);
add_charmap_undefined($4);
}
| T_LPAREN T_CHAR T_COMMA T_SYMBOL T_RPAREN
{
add_charmap_undefined($4);
}
;
collate : T_COLLATE T_NL coll_order T_END T_COLLATE T_NL
{
dump_collate();
}
| T_COLLATE T_NL coll_optional coll_order T_END T_COLLATE T_NL
{
dump_collate();
}
| T_COLLATE T_NL copycat T_END T_COLLATE T_NL
;
coll_optional : coll_optional coll_symbols
| coll_optional coll_elements
| coll_symbols
| coll_elements
;
coll_symbols : T_COLLATING_SYMBOL T_SYMBOL T_NL
{
define_collsym($2);
}
;
coll_elements : T_COLLATING_ELEMENT T_SYMBOL T_FROM string T_NL
{
define_collelem($2, get_wcs());
}
;
coll_order : T_ORDER_START T_NL order_list T_ORDER_END T_NL
{
/* If no order list supplied default to one forward */
add_order_bit(T_FORWARD);
add_order_directive();
}
| T_ORDER_START order_args T_NL order_list T_ORDER_END T_NL
;
order_args : order_args T_SEMI order_arg
{
add_order_directive();
}
| order_arg
{
add_order_directive();
}
;
order_arg : order_arg T_COMMA order_dir
| order_dir
;
order_dir : T_FORWARD
{
add_order_bit(T_FORWARD);
}
| T_BACKWARD
{
add_order_bit(T_BACKWARD);
}
| T_POSITION
{
add_order_bit(T_POSITION);
}
;
order_list : order_list order_item
| order_item
;
order_item : T_COLLSYM T_NL
{
end_order_collsym($1);
}
| order_itemkw T_NL
{
end_order();
}
| order_itemkw order_weights T_NL
{
end_order();
}
;
order_itemkw : T_CHAR
{
start_order_char($1);
}
| T_ELLIPSIS
{
start_order_ellipsis();
}
| T_COLLELEM
{
start_order_collelem($1);
}
| T_UNDEFINED
{
start_order_undefined();
}
| T_SYMBOL
{
start_order_symbol($1);
}
;
order_weights : order_weights T_SEMI order_weight
| order_weights T_SEMI
| order_weight
;
order_weight : T_COLLELEM
{
add_order_collelem($1);
}
| T_COLLSYM
{
add_order_collsym($1);
}
| T_CHAR
{
add_order_char($1);
}
| T_ELLIPSIS
{
add_order_ellipsis();
}
| T_IGNORE
{
add_order_ignore();
}
| T_SYMBOL
{
add_order_symbol($1);
}
| T_QUOTE order_str T_QUOTE
{
add_order_subst();
}
;
order_str : order_str order_stritem
| order_stritem
;
order_stritem : T_CHAR
{
add_subst_char($1);
}
| T_COLLSYM
{
add_subst_collsym($1);
}
| T_COLLELEM
{
add_subst_collelem($1);
}
| T_SYMBOL
{
add_subst_symbol($1);
}
;
messages : T_MESSAGES T_NL messages_list T_END T_MESSAGES T_NL
{
dump_messages();
}
| T_MESSAGES T_NL copycat T_END T_MESSAGES T_NL
;
messages_list : messages_list messages_item
| messages_item
;
messages_kw : T_YESSTR
| T_NOSTR
| T_YESEXPR
| T_NOEXPR
;
messages_item : messages_kw string T_NL
{
add_message(get_wcs());
}
;
monetary : T_MONETARY T_NL monetary_list T_END T_MONETARY T_NL
{
dump_monetary();
}
| T_MONETARY T_NL copycat T_END T_MONETARY T_NL
;
monetary_list : monetary_list monetary_kw
| monetary_kw
;
monetary_strkw : T_INT_CURR_SYMBOL
| T_CURRENCY_SYMBOL
| T_MON_DECIMAL_POINT
| T_MON_THOUSANDS_SEP
| T_POSITIVE_SIGN
| T_NEGATIVE_SIGN
;
monetary_numkw : T_INT_FRAC_DIGITS
| T_FRAC_DIGITS
| T_P_CS_PRECEDES
| T_P_SEP_BY_SPACE
| T_N_CS_PRECEDES
| T_N_SEP_BY_SPACE
| T_P_SIGN_POSN
| T_N_SIGN_POSN
| T_INT_P_CS_PRECEDES
| T_INT_N_CS_PRECEDES
| T_INT_P_SEP_BY_SPACE
| T_INT_N_SEP_BY_SPACE
| T_INT_P_SIGN_POSN
| T_INT_N_SIGN_POSN
;
monetary_kw : monetary_strkw string T_NL
{
add_monetary_str(get_wcs());
}
| monetary_numkw T_NUMBER T_NL
{
add_monetary_num($2);
}
| T_MON_GROUPING mon_group_list T_NL
;
mon_group_list : T_NUMBER
{
reset_monetary_group();
add_monetary_group($1);
}
| mon_group_list T_SEMI T_NUMBER
{
add_monetary_group($3);
}
;
numeric : T_NUMERIC T_NL numeric_list T_END T_NUMERIC T_NL
{
dump_numeric();
}
| T_NUMERIC T_NL copycat T_END T_NUMERIC T_NL
;
numeric_list : numeric_list numeric_item
| numeric_item
;
numeric_item : numeric_strkw string T_NL
{
add_numeric_str(get_wcs());
}
| T_GROUPING group_list T_NL
;
numeric_strkw : T_DECIMAL_POINT
| T_THOUSANDS_SEP
;
group_list : T_NUMBER
{
reset_numeric_group();
add_numeric_group($1);
}
| group_list T_SEMI T_NUMBER
{
add_numeric_group($3);
}
;
time : T_TIME T_NL time_kwlist T_END T_TIME T_NL
{
dump_time();
}
| T_TIME T_NL copycat T_END T_NUMERIC T_NL
;
time_kwlist : time_kwlist time_kw
| time_kw
;
time_kw : time_strkw string T_NL
{
add_time_str(get_wcs());
}
| time_listkw time_list T_NL
{
check_time_list();
}
;
time_listkw : T_ABDAY
| T_DAY
| T_ABMON
| T_MON
| T_ERA
| T_ALT_DIGITS
| T_AM_PM
;
time_strkw : T_ERA_D_T_FMT
| T_ERA_T_FMT
| T_ERA_D_FMT
| T_D_T_FMT
| T_D_FMT
| T_T_FMT
| T_T_FMT_AMPM
| T_DATE_FMT
;
time_list : time_list T_SEMI string
{
add_time_list(get_wcs());
}
| string
{
reset_time_list();
add_time_list(get_wcs());
}
;

866
usr.bin/localedef/scanner.c Normal file
View File

@ -0,0 +1,866 @@
/*
* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015 John Marino <draco@marino.st>
*
* This source code is derived from the illumos localedef command, and
* provided under BSD-style license terms by Nexenta Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This file contains the "scanner", which tokenizes the input files
* for localedef for processing by the higher level grammar processor.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <limits.h>
#include <string.h>
#include <wchar.h>
#include <sys/types.h>
#include <assert.h>
#include "localedef.h"
#include "parser.h"
int com_char = '#';
int esc_char = '\\';
int mb_cur_min = 1;
int mb_cur_max = 1;
int lineno = 1;
int warnings = 0;
int is_stdin = 1;
FILE *input;
static int nextline;
//static FILE *input = stdin;
static const char *filename = "<stdin>";
static int instring = 0;
static int escaped = 0;
/*
* Token space ... grows on demand.
*/
static char *token = NULL;
static int tokidx;
static int toksz = 0;
static int hadtok = 0;
/*
* Wide string space ... grows on demand.
*/
static wchar_t *widestr = NULL;
static int wideidx = 0;
static int widesz = 0;
/*
* The last keyword seen. This is useful to trigger the special lexer rules
* for "copy" and also collating symbols and elements.
*/
int last_kw = 0;
static int category = T_END;
static struct token {
int id;
const char *name;
} keywords[] = {
{ T_COM_CHAR, "comment_char" },
{ T_ESC_CHAR, "escape_char" },
{ T_END, "END" },
{ T_COPY, "copy" },
{ T_MESSAGES, "LC_MESSAGES" },
{ T_YESSTR, "yesstr" },
{ T_YESEXPR, "yesexpr" },
{ T_NOSTR, "nostr" },
{ T_NOEXPR, "noexpr" },
{ T_MONETARY, "LC_MONETARY" },
{ T_INT_CURR_SYMBOL, "int_curr_symbol" },
{ T_CURRENCY_SYMBOL, "currency_symbol" },
{ T_MON_DECIMAL_POINT, "mon_decimal_point" },
{ T_MON_THOUSANDS_SEP, "mon_thousands_sep" },
{ T_POSITIVE_SIGN, "positive_sign" },
{ T_NEGATIVE_SIGN, "negative_sign" },
{ T_MON_GROUPING, "mon_grouping" },
{ T_INT_FRAC_DIGITS, "int_frac_digits" },
{ T_FRAC_DIGITS, "frac_digits" },
{ T_P_CS_PRECEDES, "p_cs_precedes" },
{ T_P_SEP_BY_SPACE, "p_sep_by_space" },
{ T_N_CS_PRECEDES, "n_cs_precedes" },
{ T_N_SEP_BY_SPACE, "n_sep_by_space" },
{ T_P_SIGN_POSN, "p_sign_posn" },
{ T_N_SIGN_POSN, "n_sign_posn" },
{ T_INT_P_CS_PRECEDES, "int_p_cs_precedes" },
{ T_INT_N_CS_PRECEDES, "int_n_cs_precedes" },
{ T_INT_P_SEP_BY_SPACE, "int_p_sep_by_space" },
{ T_INT_N_SEP_BY_SPACE, "int_n_sep_by_space" },
{ T_INT_P_SIGN_POSN, "int_p_sign_posn" },
{ T_INT_N_SIGN_POSN, "int_n_sign_posn" },
{ T_COLLATE, "LC_COLLATE" },
{ T_COLLATING_SYMBOL, "collating-symbol" },
{ T_COLLATING_ELEMENT, "collating-element" },
{ T_FROM, "from" },
{ T_ORDER_START, "order_start" },
{ T_ORDER_END, "order_end" },
{ T_FORWARD, "forward" },
{ T_BACKWARD, "backward" },
{ T_POSITION, "position" },
{ T_IGNORE, "IGNORE" },
{ T_UNDEFINED, "UNDEFINED" },
{ T_NUMERIC, "LC_NUMERIC" },
{ T_DECIMAL_POINT, "decimal_point" },
{ T_THOUSANDS_SEP, "thousands_sep" },
{ T_GROUPING, "grouping" },
{ T_TIME, "LC_TIME" },
{ T_ABDAY, "abday" },
{ T_DAY, "day" },
{ T_ABMON, "abmon" },
{ T_MON, "mon" },
{ T_D_T_FMT, "d_t_fmt" },
{ T_D_FMT, "d_fmt" },
{ T_T_FMT, "t_fmt" },
{ T_AM_PM, "am_pm" },
{ T_T_FMT_AMPM, "t_fmt_ampm" },
{ T_ERA, "era" },
{ T_ERA_D_FMT, "era_d_fmt" },
{ T_ERA_T_FMT, "era_t_fmt" },
{ T_ERA_D_T_FMT, "era_d_t_fmt" },
{ T_ALT_DIGITS, "alt_digits" },
{ T_CTYPE, "LC_CTYPE" },
{ T_ISUPPER, "upper" },
{ T_ISLOWER, "lower" },
{ T_ISALPHA, "alpha" },
{ T_ISDIGIT, "digit" },
{ T_ISPUNCT, "punct" },
{ T_ISXDIGIT, "xdigit" },
{ T_ISSPACE, "space" },
{ T_ISPRINT, "print" },
{ T_ISGRAPH, "graph" },
{ T_ISBLANK, "blank" },
{ T_ISCNTRL, "cntrl" },
/*
* These entries are local additions, and not specified by
* TOG. Note that they are not guaranteed to be accurate for
* all locales, and so applications should not depend on them.
*/
{ T_ISSPECIAL, "special" },
{ T_ISENGLISH, "english" },
{ T_ISPHONOGRAM, "phonogram" },
{ T_ISIDEOGRAM, "ideogram" },
{ T_ISNUMBER, "number" },
/*
* We have to support this in the grammar, but it would be a
* syntax error to define a character as one of these without
* also defining it as an alpha or digit. We ignore it in our
* parsing.
*/
{ T_ISALNUM, "alnum" },
{ T_TOUPPER, "toupper" },
{ T_TOLOWER, "tolower" },
/*
* These are keywords used in the charmap file. Note that
* Solaris orginally used angle brackets to wrap some of them,
* but we removed that to simplify our parser. The first of these
* items are "global items."
*/
{ T_CHARMAP, "CHARMAP" },
{ T_WIDTH, "WIDTH" },
{ -1, NULL },
};
/*
* These special words are only used in a charmap file, enclosed in <>.
*/
static struct token symwords[] = {
{ T_COM_CHAR, "comment_char" },
{ T_ESC_CHAR, "escape_char" },
{ T_CODE_SET, "code_set_name" },
{ T_MB_CUR_MAX, "mb_cur_max" },
{ T_MB_CUR_MIN, "mb_cur_min" },
{ -1, NULL },
};
static int categories[] = {
T_CHARMAP,
T_CTYPE,
T_COLLATE,
T_MESSAGES,
T_MONETARY,
T_NUMERIC,
T_TIME,
T_WIDTH,
0
};
void
reset_scanner(const char *fname)
{
if (fname == NULL) {
filename = "<stdin>";
is_stdin = 1;
} else {
if (!is_stdin)
(void) fclose(input);
if ((input = fopen(fname, "r")) == NULL) {
perror("fopen");
exit(4);
} else {
is_stdin = 0;
}
filename = fname;
}
com_char = '#';
esc_char = '\\';
instring = 0;
escaped = 0;
lineno = 1;
nextline = 1;
tokidx = 0;
wideidx = 0;
}
#define hex(x) \
(isdigit(x) ? (x - '0') : ((islower(x) ? (x - 'a') : (x - 'A')) + 10))
#define isodigit(x) ((x >= '0') && (x <= '7'))
static int
scanc(void)
{
int c;
if (is_stdin)
c = getc(stdin);
else
c = getc(input);
lineno = nextline;
if (c == '\n') {
nextline++;
}
return (c);
}
static void
unscanc(int c)
{
if (c == '\n') {
nextline--;
}
if (ungetc(c, is_stdin ? stdin : input) < 0) {
yyerror("ungetc failed");
}
}
static int
scan_hex_byte(void)
{
int c1, c2;
int v;
c1 = scanc();
if (!isxdigit(c1)) {
yyerror("malformed hex digit");
return (0);
}
c2 = scanc();
if (!isxdigit(c2)) {
yyerror("malformed hex digit");
return (0);
}
v = ((hex(c1) << 4) | hex(c2));
return (v);
}
static int
scan_dec_byte(void)
{
int c1, c2, c3;
int b;
c1 = scanc();
if (!isdigit(c1)) {
yyerror("malformed decimal digit");
return (0);
}
b = c1 - '0';
c2 = scanc();
if (!isdigit(c2)) {
yyerror("malformed decimal digit");
return (0);
}
b *= 10;
b += (c2 - '0');
c3 = scanc();
if (!isdigit(c3)) {
unscanc(c3);
} else {
b *= 10;
b += (c3 - '0');
}
return (b);
}
static int
scan_oct_byte(void)
{
int c1, c2, c3;
int b;
b = 0;
c1 = scanc();
if (!isodigit(c1)) {
yyerror("malformed octal digit");
return (0);
}
b = c1 - '0';
c2 = scanc();
if (!isodigit(c2)) {
yyerror("malformed octal digit");
return (0);
}
b *= 8;
b += (c2 - '0');
c3 = scanc();
if (!isodigit(c3)) {
unscanc(c3);
} else {
b *= 8;
b += (c3 - '0');
}
return (b);
}
void
add_tok(int c)
{
if ((tokidx + 1) >= toksz) {
toksz += 64;
if ((token = realloc(token, toksz)) == NULL) {
yyerror("out of memory");
tokidx = 0;
toksz = 0;
return;
}
}
token[tokidx++] = (char)c;
token[tokidx] = 0;
}
void
add_wcs(wchar_t c)
{
if ((wideidx + 1) >= widesz) {
widesz += 64;
widestr = realloc(widestr, (widesz * sizeof (wchar_t)));
if (widestr == NULL) {
yyerror("out of memory");
wideidx = 0;
widesz = 0;
return;
}
}
widestr[wideidx++] = c;
widestr[wideidx] = 0;
}
wchar_t *
get_wcs(void)
{
wchar_t *ws = widestr;
wideidx = 0;
widestr = NULL;
widesz = 0;
if (ws == NULL) {
if ((ws = wcsdup(L"")) == NULL) {
yyerror("out of memory");
}
}
return (ws);
}
static int
get_byte(void)
{
int c;
if ((c = scanc()) != esc_char) {
unscanc(c);
return (EOF);
}
c = scanc();
switch (c) {
case 'd':
case 'D':
return (scan_dec_byte());
case 'x':
case 'X':
return (scan_hex_byte());
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
/* put the character back so we can get it */
unscanc(c);
return (scan_oct_byte());
default:
unscanc(c);
unscanc(esc_char);
return (EOF);
}
}
int
get_escaped(int c)
{
switch (c) {
case 'n':
return ('\n');
case 'r':
return ('\r');
case 't':
return ('\t');
case 'f':
return ('\f');
case 'v':
return ('\v');
case 'b':
return ('\b');
case 'a':
return ('\a');
default:
return (c);
}
}
int
get_wide(void)
{
static char mbs[MB_LEN_MAX + 1] = "";
static int mbi = 0;
int c;
wchar_t wc;
if (mb_cur_max >= (int)sizeof (mbs)) {
yyerror("max multibyte character size too big");
mbi = 0;
return (T_NULL);
}
for (;;) {
if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) {
/*
* end of the byte sequence reached, but no
* valid wide decoding. fatal error.
*/
mbi = 0;
yyerror("not a valid character encoding");
return (T_NULL);
}
mbs[mbi++] = c;
mbs[mbi] = 0;
/* does it decode? */
if (to_wide(&wc, mbs) >= 0) {
break;
}
}
mbi = 0;
if ((category != T_CHARMAP) && (category != T_WIDTH)) {
if (check_charmap(wc) < 0) {
yyerror("no symbolic name for character");
return (T_NULL);
}
}
yylval.wc = wc;
return (T_CHAR);
}
int
get_symbol(void)
{
int c;
while ((c = scanc()) != EOF) {
if (escaped) {
escaped = 0;
if (c == '\n')
continue;
add_tok(get_escaped(c));
continue;
}
if (c == esc_char) {
escaped = 1;
continue;
}
if (c == '\n') { /* well that's strange! */
yyerror("unterminated symbolic name");
continue;
}
if (c == '>') { /* end of symbol */
/*
* This restarts the token from the beginning
* the next time we scan a character. (This
* token is complete.)
*/
if (token == NULL) {
yyerror("missing symbolic name");
return (T_NULL);
}
tokidx = 0;
/*
* A few symbols are handled as keywords outside
* of the normal categories.
*/
if (category == T_END) {
int i;
for (i = 0; symwords[i].name != 0; i++) {
if (strcmp(token, symwords[i].name) ==
0) {
last_kw = symwords[i].id;
return (last_kw);
}
}
}
/*
* Contextual rule: Only literal characters are
* permitted in CHARMAP. Anywhere else the symbolic
* forms are fine.
*/
if ((category != T_CHARMAP) &&
(lookup_charmap(token, &yylval.wc)) != -1) {
return (T_CHAR);
}
if ((yylval.collsym = lookup_collsym(token)) != NULL) {
return (T_COLLSYM);
}
if ((yylval.collelem = lookup_collelem(token)) !=
NULL) {
return (T_COLLELEM);
}
/* its an undefined symbol */
yylval.token = strdup(token);
token = NULL;
toksz = 0;
tokidx = 0;
return (T_SYMBOL);
}
add_tok(c);
}
yyerror("unterminated symbolic name");
return (EOF);
}
int
get_category(void)
{
return (category);
}
static int
consume_token(void)
{
int len = tokidx;
int i;
tokidx = 0;
if (token == NULL)
return (T_NULL);
/*
* this one is special, because we don't want it to alter the
* last_kw field.
*/
if (strcmp(token, "...") == 0) {
return (T_ELLIPSIS);
}
/* search for reserved words first */
for (i = 0; keywords[i].name; i++) {
int j;
if (strcmp(keywords[i].name, token) != 0) {
continue;
}
last_kw = keywords[i].id;
/* clear the top level category if we're done with it */
if (last_kw == T_END) {
category = T_END;
}
/* set the top level category if we're changing */
for (j = 0; categories[j]; j++) {
if (categories[j] != last_kw)
continue;
category = last_kw;
}
return (keywords[i].id);
}
/* maybe its a numeric constant? */
if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) {
char *eptr;
yylval.num = strtol(token, &eptr, 10);
if (*eptr != 0)
yyerror("malformed number");
return (T_NUMBER);
}
/*
* A single lone character is treated as a character literal.
* To avoid duplication of effort, we stick in the charmap.
*/
if (len == 1) {
yylval.wc = token[0];
return (T_CHAR);
}
/* anything else is treated as a symbolic name */
yylval.token = strdup(token);
token = NULL;
toksz = 0;
tokidx = 0;
return (T_NAME);
}
void
scan_to_eol(void)
{
int c;
while ((c = scanc()) != '\n') {
if (c == EOF) {
/* end of file without newline! */
errf("missing newline");
return;
}
}
assert(c == '\n');
}
int
yylex(void)
{
int c;
while ((c = scanc()) != EOF) {
/* special handling for quoted string */
if (instring) {
if (escaped) {
escaped = 0;
/* if newline, just eat and forget it */
if (c == '\n')
continue;
if (strchr("xXd01234567", c)) {
unscanc(c);
unscanc(esc_char);
return (get_wide());
}
yylval.wc = get_escaped(c);
return (T_CHAR);
}
if (c == esc_char) {
escaped = 1;
continue;
}
switch (c) {
case '<':
return (get_symbol());
case '>':
/* oops! should generate syntax error */
return (T_GT);
case '"':
instring = 0;
return (T_QUOTE);
default:
yylval.wc = c;
return (T_CHAR);
}
}
/* escaped characters first */
if (escaped) {
escaped = 0;
if (c == '\n') {
/* eat the newline */
continue;
}
hadtok = 1;
if (tokidx) {
/* an escape mid-token is nonsense */
return (T_NULL);
}
/* numeric escapes are treated as wide characters */
if (strchr("xXd01234567", c)) {
unscanc(c);
unscanc(esc_char);
return (get_wide());
}
add_tok(get_escaped(c));
continue;
}
/* if it is the escape charter itself note it */
if (c == esc_char) {
escaped = 1;
continue;
}
/* remove from the comment char to end of line */
if (c == com_char) {
while (c != '\n') {
if ((c = scanc()) == EOF) {
/* end of file without newline! */
return (EOF);
}
}
assert(c == '\n');
if (!hadtok) {
/*
* If there were no tokens on this line,
* then just pretend it didn't exist at all.
*/
continue;
}
hadtok = 0;
return (T_NL);
}
if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) {
/*
* These are all token delimiters. If there
* is a token already in progress, we need to
* process it.
*/
unscanc(c);
return (consume_token());
}
switch (c) {
case '\n':
if (!hadtok) {
/*
* If the line was completely devoid of tokens,
* then just ignore it.
*/
continue;
}
/* we're starting a new line, reset the token state */
hadtok = 0;
return (T_NL);
case ',':
hadtok = 1;
return (T_COMMA);
case ';':
hadtok = 1;
return (T_SEMI);
case '(':
hadtok = 1;
return (T_LPAREN);
case ')':
hadtok = 1;
return (T_RPAREN);
case '>':
hadtok = 1;
return (T_GT);
case '<':
/* symbol start! */
hadtok = 1;
return (get_symbol());
case ' ':
case '\t':
/* whitespace, just ignore it */
continue;
case '"':
hadtok = 1;
instring = 1;
return (T_QUOTE);
default:
hadtok = 1;
add_tok(c);
continue;
}
}
return (EOF);
}
void
yyerror(const char *msg)
{
(void) fprintf(stderr, "%s: %d: error: %s\n",
filename, lineno, msg);
exit(4);
}
void
errf(const char *fmt, ...)
{
char *msg;
va_list va;
va_start(va, fmt);
(void) vasprintf(&msg, fmt, va);
va_end(va);
(void) fprintf(stderr, "%s: %d: error: %s\n",
filename, lineno, msg);
free(msg);
exit(4);
}
void
warn(const char *fmt, ...)
{
char *msg;
va_list va;
va_start(va, fmt);
(void) vasprintf(&msg, fmt, va);
va_end(va);
(void) fprintf(stderr, "%s: %d: warning: %s\n",
filename, lineno, msg);
free(msg);
warnings++;
if (!warnok)
exit(4);
}

280
usr.bin/localedef/time.c Normal file
View File

@ -0,0 +1,280 @@
/*
* Copyright 2010 Nexenta Systems, Inc. All rights reserved.
* Copyright 2015 John Marino <draco@marino.st>
*
* This source code is derived from the illumos localedef command, and
* provided under BSD-style license terms by Nexenta Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* LC_TIME database generation routines for localedef.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <sys/types.h>
#include <string.h>
#include <unistd.h>
#include "localedef.h"
#include "parser.h"
#include "timelocal.h"
struct lc_time_T tm;
void
init_time(void)
{
(void) memset(&tm, 0, sizeof (tm));
}
void
add_time_str(wchar_t *wcs)
{
char *str;
if ((str = to_mb_string(wcs)) == NULL) {
INTERR;
return;
}
free(wcs);
switch (last_kw) {
case T_D_T_FMT:
tm.c_fmt = str;
break;
case T_D_FMT:
tm.x_fmt = str;
break;
case T_T_FMT:
tm.X_fmt = str;
break;
case T_T_FMT_AMPM:
tm.ampm_fmt = str;
break;
case T_DATE_FMT:
/*
* This one is a Solaris extension, Too bad date just
* doesn't use %c, which would be simpler.
*/
tm.date_fmt = str;
break;
case T_ERA_D_FMT:
case T_ERA_T_FMT:
case T_ERA_D_T_FMT:
/* Silently ignore it. */
break;
default:
free(str);
INTERR;
break;
}
}
static void
add_list(const char *ptr[], char *str, int limit)
{
int i;
for (i = 0; i < limit; i++) {
if (ptr[i] == NULL) {
ptr[i] = str;
return;
}
}
fprintf(stderr,"too many list elements");
}
void
add_time_list(wchar_t *wcs)
{
char *str;
if ((str = to_mb_string(wcs)) == NULL) {
INTERR;
return;
}
free(wcs);
switch (last_kw) {
case T_ABMON:
add_list(tm.mon, str, 12);
break;
case T_MON:
add_list(tm.month, str, 12);
break;
case T_ABDAY:
add_list(tm.wday, str, 7);
break;
case T_DAY:
add_list(tm.weekday, str, 7);
break;
case T_AM_PM:
if (tm.am == NULL) {
tm.am = str;
} else if (tm.pm == NULL) {
tm.pm = str;
} else {
fprintf(stderr,"too many list elements");
}
break;
case T_ALT_DIGITS:
case T_ERA:
free(str);
break;
default:
free(str);
INTERR;
break;
}
}
void
check_time_list(void)
{
switch (last_kw) {
case T_ABMON:
if (tm.mon[11] != NULL)
return;
break;
case T_MON:
if (tm.month[11] != NULL)
return;
break;
case T_ABDAY:
if (tm.wday[6] != NULL)
return;
break;
case T_DAY:
if (tm.weekday[6] != NULL)
return;
break;
case T_AM_PM:
if (tm.pm != NULL)
return;
break;
case T_ERA:
case T_ALT_DIGITS:
return;
default:
fprintf(stderr,"unknown list");
break;
}
fprintf(stderr,"too few items in list (%d)", last_kw);
}
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wcast-qual"
void
reset_time_list(void)
{
int i;
switch (last_kw) {
case T_ABMON:
for (i = 0; i < 12; i++) {
free((char *)tm.mon[i]);
tm.mon[i] = NULL;
}
break;
case T_MON:
for (i = 0; i < 12; i++) {
free((char *)tm.month[i]);
tm.month[i] = NULL;
}
break;
case T_ABDAY:
for (i = 0; i < 7; i++) {
free((char *)tm.wday[i]);
tm.wday[i] = NULL;
}
break;
case T_DAY:
for (i = 0; i < 7; i++) {
free((char *)tm.weekday[i]);
tm.weekday[i] = NULL;
}
break;
case T_AM_PM:
free((char *)tm.am);
tm.am = NULL;
free((char *)tm.pm);
tm.pm = NULL;
break;
}
}
#pragma GCC diagnostic pop
void
dump_time(void)
{
FILE *f;
int i;
if ((f = open_category()) == NULL) {
return;
}
for (i = 0; i < 12; i++) {
if (putl_category(tm.mon[i], f) == EOF) {
return;
}
}
for (i = 0; i < 12; i++) {
if (putl_category(tm.month[i], f) == EOF) {
return;
}
}
for (i = 0; i < 7; i++) {
if (putl_category(tm.wday[i], f) == EOF) {
return;
}
}
for (i = 0; i < 7; i++) {
if (putl_category(tm.weekday[i], f) == EOF) {
return;
}
}
/*
* NOTE: If date_fmt is not specified, then we'll default to
* using the %c for date. This is reasonable for most
* locales, although for reasons that I don't understand
* Solaris historically has had a seperate format for date.
*/
if ((putl_category(tm.X_fmt, f) == EOF) ||
(putl_category(tm.x_fmt, f) == EOF) ||
(putl_category(tm.c_fmt, f) == EOF) ||
(putl_category(tm.am, f) == EOF) ||
(putl_category(tm.pm, f) == EOF) ||
(putl_category(tm.date_fmt ? tm.date_fmt : tm.c_fmt, f) == EOF) ||
(putl_category(tm.ampm_fmt, f) == EOF)) {
return;
}
close_category(f);
}

669
usr.bin/localedef/wide.c Normal file
View File

@ -0,0 +1,669 @@
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved.
* Copyright 2015 John Marino <draco@marino.st>
*
* This source code is derived from the illumos localedef command, and
* provided under BSD-style license terms by Nexenta Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* The functions in this file convert from the standard multibyte forms
* to the wide character forms used internally by libc. Unfortunately,
* this approach means that we need a method for each and every encoding.
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <ctype.h>
#include <stdlib.h>
#include <wchar.h>
#include <string.h>
#include <sys/types.h>
#include "localedef.h"
static int towide_none(wchar_t *, const char *, unsigned);
static int towide_utf8(wchar_t *, const char *, unsigned);
static int towide_big5(wchar_t *, const char *, unsigned);
static int towide_gbk(wchar_t *, const char *, unsigned);
static int towide_gb2312(wchar_t *, const char *, unsigned);
static int towide_gb18030(wchar_t *, const char *, unsigned);
static int towide_mskanji(wchar_t *, const char *, unsigned);
static int towide_euccn(wchar_t *, const char *, unsigned);
static int towide_eucjp(wchar_t *, const char *, unsigned);
static int towide_euckr(wchar_t *, const char *, unsigned);
static int towide_euctw(wchar_t *, const char *, unsigned);
static int tomb_none(char *, wchar_t);
static int tomb_utf8(char *, wchar_t);
static int tomb_mbs(char *, wchar_t);
static int (*_towide)(wchar_t *, const char *, unsigned) = towide_none;
static int (*_tomb)(char *, wchar_t) = tomb_none;
static const char *_encoding = "NONE";
static int _nbits = 7;
/*
* Table of supported encodings. We only bother to list the multibyte
* encodings here, because single byte locales are handed by "NONE".
*/
static struct {
const char *name;
/* the name that the underlying libc implemenation uses */
const char *cname;
/* the maximum number of bits required for priorities */
int nbits;
int (*towide)(wchar_t *, const char *, unsigned);
int (*tomb)(char *, wchar_t);
} mb_encodings[] = {
/*
* UTF8 values max out at 0x1fffff (although in theory there could
* be later extensions, but it won't happen.) This means we only need
* 21 bits to be able to encode the entire range of priorities.
*/
{ "UTF-8", "UTF-8", 21, towide_utf8, tomb_utf8 },
{ "UTF8", "UTF-8", 21, towide_utf8, tomb_utf8 },
{ "utf8", "UTF-8", 21, towide_utf8, tomb_utf8 },
{ "utf-8", "UTF-8", 21, towide_utf8, tomb_utf8 },
{ "EUC-CN", "EUC-CN", 16, towide_euccn, tomb_mbs },
{ "eucCN", "EUC-CN", 16, towide_euccn, tomb_mbs },
/*
* Becuase the 3-byte form of EUC-JP use the same leading byte,
* only 17 bits required to provide unique priorities. (The low
* bit of that first byte is set.) By setting this value low,
* we can get by with only 3 bytes in the strxfrm expansion.
*/
{ "EUC-JP", "EUC-JP", 17, towide_eucjp, tomb_mbs },
{ "eucJP", "EUC-JP", 17, towide_eucjp, tomb_mbs },
{ "EUC-KR", "EUC-KR", 16, towide_euckr, tomb_mbs },
{ "eucKR", "EUC-KR", 16, towide_euckr, tomb_mbs },
/*
* EUC-TW uses 2 bytes most of the time, but 4 bytes if the
* high order byte is 0x8E. However, with 4 byte encodings,
* the third byte will be A0-B0. So we only need to consider
* the lower order 24 bits for collation.
*/
{ "EUC-TW", "EUC-TW", 24, towide_euctw, tomb_mbs },
{ "eucTW", "EUC-TW", 24, towide_euctw, tomb_mbs },
{ "MS_Kanji", "MSKanji", 16, towide_mskanji, tomb_mbs },
{ "MSKanji", "MSKanji", 16, towide_mskanji, tomb_mbs },
{ "PCK", "MSKanji", 16, towide_mskanji, tomb_mbs },
{ "SJIS", "MSKanji", 16, towide_mskanji, tomb_mbs },
{ "Shift_JIS", "MSKanji", 16, towide_mskanji, tomb_mbs },
{ "BIG5", "BIG5", 16, towide_big5, tomb_mbs },
{ "big5", "BIG5", 16, towide_big5, tomb_mbs },
{ "Big5", "BIG5", 16, towide_big5, tomb_mbs },
{ "GBK", "GBK", 16, towide_gbk, tomb_mbs },
/*
* GB18030 can get away with just 31 bits. This is because the
* high order bit is always set for 4 byte values, and the
* at least one of the other bits in that 4 byte value will
* be non-zero.
*/
{ "GB18030", "GB18030", 31, towide_gb18030, tomb_mbs },
/*
* This should probably be an aliase for euc-cn, or vice versa.
*/
{ "GB2312", "GB2312", 16, towide_gb2312, tomb_mbs },
{ NULL, NULL, 0, 0, 0 },
};
static char *
show_mb(const char *mb)
{
static char buf[64];
/* ASCII stuff we just print */
if (isascii(*mb) && isgraph(*mb)) {
buf[0] = *mb;
buf[1] = 0;
return (buf);
}
buf[0] = 0;
while (*mb != 0) {
char scr[8];
(void) snprintf(scr, sizeof (scr), "\\x%02x", *mb);
(void) strlcat(buf, scr, sizeof (buf));
mb++;
}
return (buf);
}
static char *widemsg;
void
werr(const char *fmt, ...)
{
char *msg;
va_list va;
va_start(va, fmt);
(void) vasprintf(&msg, fmt, va);
va_end(va);
free(widemsg);
widemsg = msg;
}
/*
* This is used for 8-bit encodings.
*/
int
towide_none(wchar_t *c, const char *mb, unsigned n __unused)
{
if (mb_cur_max != 1) {
werr("invalid or unsupported multibyte locale");
return (-1);
}
*c = (uint8_t)*mb;
return (1);
}
int
tomb_none(char *mb, wchar_t wc)
{
if (mb_cur_max != 1) {
werr("invalid or unsupported multibyte locale");
return (-1);
}
*(uint8_t *)mb = (wc & 0xff);
mb[1] = 0;
return (1);
}
/*
* UTF-8 stores wide characters in UTF-32 form.
*/
int
towide_utf8(wchar_t *wc, const char *mb, unsigned n)
{
wchar_t c;
int nb;
int lv; /* lowest legal value */
int i;
const uint8_t *s = (const uint8_t *)mb;
c = *s;
if ((c & 0x80) == 0) {
/* 7-bit ASCII */
*wc = c;
return (1);
} else if ((c & 0xe0) == 0xc0) {
/* u80-u7ff - two bytes encoded */
nb = 2;
lv = 0x80;
c &= ~0xe0;
} else if ((c & 0xf0) == 0xe0) {
/* u800-uffff - three bytes encoded */
nb = 3;
lv = 0x800;
c &= ~0xf0;
} else if ((c & 0xf8) == 0xf0) {
/* u1000-u1fffff - four bytes encoded */
nb = 4;
lv = 0x1000;
c &= ~0xf8;
} else {
/* 5 and 6 byte encodings are not legal unicode */
werr("utf8 encoding too large (%s)", show_mb(mb));
return (-1);
}
if (nb > (int)n) {
werr("incomplete utf8 sequence (%s)", show_mb(mb));
return (-1);
}
for (i = 1; i < nb; i++) {
if (((s[i]) & 0xc0) != 0x80) {
werr("illegal utf8 byte (%x)", s[i]);
return (-1);
}
c <<= 6;
c |= (s[i] & 0x3f);
}
if (c < lv) {
werr("illegal redundant utf8 encoding (%s)", show_mb(mb));
return (-1);
}
*wc = c;
return (nb);
}
int
tomb_utf8(char *mb, wchar_t wc)
{
uint8_t *s = (uint8_t *)mb;
uint8_t msk;
int cnt;
int i;
if (wc <= 0x7f) {
s[0] = wc & 0x7f;
s[1] = 0;
return (1);
}
if (wc <= 0x7ff) {
cnt = 2;
msk = 0xc0;
} else if (wc <= 0xffff) {
cnt = 3;
msk = 0xe0;
} else if (wc <= 0x1fffff) {
cnt = 4;
msk = 0xf0;
} else {
werr("illegal uf8 char (%x)", wc);
return (-1);
}
for (i = cnt - 1; i; i--) {
s[i] = (wc & 0x3f) | 0x80;
wc >>= 6;
}
s[0] = (msk) | wc;
s[cnt] = 0;
return (cnt);
}
/*
* Several encodings share a simplistic dual byte encoding. In these
* forms, they all indicate that a two byte sequence is to be used if
* the first byte has its high bit set. They all store this simple
* encoding as a 16-bit value, although a great many of the possible
* code points are not used in most character sets. This gives a possible
* set of just over 32,000 valid code points.
*
* 0x00 - 0x7f - 1 byte encoding
* 0x80 - 0x7fff - illegal
* 0x8000 - 0xffff - 2 byte encoding
*/
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wcast-qual"
static int
towide_dbcs(wchar_t *wc, const char *mb, unsigned n)
{
wchar_t c;
c = *(uint8_t *)mb;
if ((c & 0x80) == 0) {
/* 7-bit */
*wc = c;
return (1);
}
if (n < 2) {
werr("incomplete character sequence (%s)", show_mb(mb));
return (-1);
}
/* Store both bytes as a single 16-bit wide. */
c <<= 8;
c |= (uint8_t)(mb[1]);
*wc = c;
return (2);
}
/*
* Most multibyte locales just convert the wide character to the multibyte
* form by stripping leading null bytes, and writing the 32-bit quantity
* in big-endian order.
*/
int
tomb_mbs(char *mb, wchar_t wc)
{
uint8_t *s = (uint8_t *)mb;
int n = 0, c;
if ((wc & 0xff000000U) != 0) {
n = 4;
} else if ((wc & 0x00ff0000U) != 0) {
n = 3;
} else if ((wc & 0x0000ff00U) != 0) {
n = 2;
} else {
n = 1;
}
c = n;
while (n) {
n--;
s[n] = wc & 0xff;
wc >>= 8;
}
/* ensure null termination */
s[c] = 0;
return (c);
}
/*
* big5 is a simple dual byte character set.
*/
int
towide_big5(wchar_t *wc, const char *mb, unsigned n)
{
return (towide_dbcs(wc, mb, n));
}
/*
* GBK encodes wides in the same way that big5 does, the high order
* bit of the first byte indicates a double byte character.
*/
int
towide_gbk(wchar_t *wc, const char *mb, unsigned n)
{
return (towide_dbcs(wc, mb, n));
}
/*
* GB2312 is another DBCS. Its cleaner than others in that the second
* byte does not encode ASCII, but it supports characters.
*/
int
towide_gb2312(wchar_t *wc, const char *mb, unsigned n)
{
return (towide_dbcs(wc, mb, n));
}
/*
* GB18030. This encodes as 8, 16, or 32-bits.
* 7-bit values are in 1 byte, 4 byte sequences are used when
* the second byte encodes 0x30-39 and all other sequences are 2 bytes.
*/
int
towide_gb18030(wchar_t *wc, const char *mb, unsigned n)
{
wchar_t c;
c = *(uint8_t *)mb;
if ((c & 0x80) == 0) {
/* 7-bit */
*wc = c;
return (1);
}
if (n < 2) {
werr("incomplete character sequence (%s)", show_mb(mb));
return (-1);
}
/* pull in the second byte */
c <<= 8;
c |= (uint8_t)(mb[1]);
if (((c & 0xff) >= 0x30) && ((c & 0xff) <= 0x39)) {
if (n < 4) {
werr("incomplete 4-byte character sequence (%s)",
show_mb(mb));
return (-1);
}
c <<= 8;
c |= (uint8_t)(mb[2]);
c <<= 8;
c |= (uint8_t)(mb[3]);
*wc = c;
return (4);
}
*wc = c;
return (2);
}
/*
* MS-Kanji (aka SJIS) is almost a clean DBCS like the others, but it
* also has a range of single byte characters above 0x80. (0xa1-0xdf).
*/
int
towide_mskanji(wchar_t *wc, const char *mb, unsigned n)
{
wchar_t c;
c = *(uint8_t *)mb;
if ((c < 0x80) || ((c > 0xa0) && (c < 0xe0))) {
/* 7-bit */
*wc = c;
return (1);
}
if (n < 2) {
werr("incomplete character sequence (%s)", show_mb(mb));
return (-1);
}
/* Store both bytes as a single 16-bit wide. */
c <<= 8;
c |= (uint8_t)(mb[1]);
*wc = c;
return (2);
}
/*
* EUC forms. EUC encodings are "variable". FreeBSD carries some additional
* variable data to encode these, but we're going to treat each as independent
* instead. Its the only way we can sensibly move forward.
*
* Note that the way in which the different EUC forms vary is how wide
* CS2 and CS3 are and what the first byte of them is.
*/
static int
towide_euc_impl(wchar_t *wc, const char *mb, unsigned n,
uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width)
{
int i;
int width = 2;
wchar_t c;
c = *(uint8_t *)mb;
/*
* All variations of EUC encode 7-bit ASCII as one byte, and use
* additional bytes for more than that.
*/
if ((c & 0x80) == 0) {
/* 7-bit */
*wc = c;
return (1);
}
/*
* All EUC variants reserve 0xa1-0xff to identify CS1, which
* is always two bytes wide. Note that unused CS will be zero,
* and that cannot be true because we know that the high order
* bit must be set.
*/
if (c >= 0xa1) {
width = 2;
} else if (c == cs2) {
width = cs2width;
} else if (c == cs3) {
width = cs3width;
}
if ((int)n < width) {
werr("incomplete character sequence (%s)", show_mb(mb));
return (-1);
}
for (i = 1; i < width; i++) {
/* pull in the next byte */
c <<= 8;
c |= (uint8_t)(mb[i]);
}
*wc = c;
return (width);
}
#pragma GCC diagnostic pop
/*
* EUC-CN encodes as follows:
*
* Code set 0 (ASCII): 0x21-0x7E
* Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE
* Code set 2: unused
* Code set 3: unused
*/
int
towide_euccn(wchar_t *wc, const char *mb, unsigned n)
{
return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0));
}
/*
* EUC-JP encodes as follows:
*
* Code set 0 (ASCII or JIS X 0201-1976 Roman): 0x21-0x7E
* Code set 1 (JIS X 0208): 0xA1A1-0xFEFE
* Code set 2 (half-width katakana): 0x8EA1-0x8EDF
* Code set 3 (JIS X 0212-1990): 0x8FA1A1-0x8FFEFE
*/
int
towide_eucjp(wchar_t *wc, const char *mb, unsigned n)
{
return (towide_euc_impl(wc, mb, n, 0x8e, 2, 0x8f, 3));
}
/*
* EUC-KR encodes as follows:
*
* Code set 0 (ASCII or KS C 5636-1993): 0x21-0x7E
* Code set 1 (KS C 5601-1992): 0xA1A1-0xFEFE
* Code set 2: unused
* Code set 3: unused
*/
int
towide_euckr(wchar_t *wc, const char *mb, unsigned n)
{
return (towide_euc_impl(wc, mb, n, 0, 0, 0, 0));
}
/*
* EUC-TW encodes as follows:
*
* Code set 0 (ASCII): 0x21-0x7E
* Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE
* Code set 2 (CNS 11643-1992 Planes 1-16): 0x8EA1A1A1-0x8EB0FEFE
* Code set 3: unused
*/
int
towide_euctw(wchar_t *wc, const char *mb, unsigned n)
{
return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0));
}
/*
* Public entry points.
*/
int
to_wide(wchar_t *wc, const char *mb)
{
/* this won't fail hard */
return (_towide(wc, mb, strlen(mb)));
}
int
to_mb(char *mb, wchar_t wc)
{
int rv;
if ((rv = _tomb(mb, wc)) < 0) {
errf(widemsg);
free(widemsg);
widemsg = NULL;
}
return (rv);
}
char *
to_mb_string(const wchar_t *wcs)
{
char *mbs;
char *ptr;
int len;
mbs = malloc((wcslen(wcs) * mb_cur_max) + 1);
if (mbs == NULL) {
errf("out of memory");
return (NULL);
}
ptr = mbs;
while (*wcs) {
if ((len = to_mb(ptr, *wcs)) < 0) {
INTERR;
free(mbs);
return (NULL);
}
wcs++;
ptr += len;
}
*ptr = 0;
return (mbs);
}
void
set_wide_encoding(const char *encoding)
{
int i;
_towide = towide_none;
_tomb = tomb_none;
_encoding = "NONE";
_nbits = 8;
for (i = 0; mb_encodings[i].name; i++) {
if (strcasecmp(encoding, mb_encodings[i].name) == 0) {
_towide = mb_encodings[i].towide;
_tomb = mb_encodings[i].tomb;
_encoding = mb_encodings[i].cname;
_nbits = mb_encodings[i].nbits;
break;
}
}
}
const char *
get_wide_encoding(void)
{
return (_encoding);
}
int
max_wide(void)
{
return ((int)((1U << _nbits) - 1));
}