diff --git a/usr.bin/Makefile b/usr.bin/Makefile index cf5604c79a06..e8df8e013ecd 100644 --- a/usr.bin/Makefile +++ b/usr.bin/Makefile @@ -84,6 +84,7 @@ SUBDIR= ${_addr2line} \ lesskey \ limits \ locale \ + localedef \ lock \ lockf \ logger \ diff --git a/usr.bin/localedef/Makefile b/usr.bin/localedef/Makefile new file mode 100644 index 000000000000..93c0515c85c6 --- /dev/null +++ b/usr.bin/localedef/Makefile @@ -0,0 +1,30 @@ +# $FreeBSD$ + +PROG= localedef +SRCS= charmap.c \ + collate.c \ + ctype.c \ + localedef.c \ + messages.c \ + monetary.c \ + numeric.c \ + parser.y \ + scanner.c \ + time.c \ + wide.c + +WARNS= 3 +${SRCS:M*.c}: parser.h +parser.h: parser.y +LIBADD= avl + +IGNORE_PRAGMA= yes + +CFLAGS+= -DNEED_SOLARIS_BOOLEAN +CFLAGS+= -I. -I${.CURDIR} +CFLAGS+= -I${.CURDIR}/../../lib/libc/locale +CFLAGS+= -I${.CURDIR}/../../lib/libc/stdtime +CFLAGS+= -I${.CURDIR}/../../sys/cddl/compat/opensolaris +CFLAGS+= -I${.CURDIR}/../../sys/cddl/contrib/opensolaris/uts/common + +.include diff --git a/usr.bin/localedef/README b/usr.bin/localedef/README new file mode 100644 index 000000000000..4d97371941c8 --- /dev/null +++ b/usr.bin/localedef/README @@ -0,0 +1,11 @@ +While there are tools called "localedef" in Solaris and Linux, this +tool does not share heritage with any other implementation. It was +written independently by Garrett D'Amore while employed at Nexenta +Systems, and thus carries the Nexenta Copyright. + +It was initially released under the CDDL license, but on 4 July 2014, +Nexenta reissued the source under the BSD 2-clause license. This +code is part of the Illumos project. + +see: +https://github.com/Nexenta/illumos-nexenta/commit/cf17542a37fc83d0ae093777e30d480423858c29 diff --git a/usr.bin/localedef/charmap.c b/usr.bin/localedef/charmap.c new file mode 100644 index 000000000000..203008b7cec4 --- /dev/null +++ b/usr.bin/localedef/charmap.c @@ -0,0 +1,364 @@ +/* + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 John Marino + * + * This source code is derived from the illumos localedef command, and + * provided under BSD-style license terms by Nexenta Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * CHARMAP file handling for localedef. + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "localedef.h" +#include "parser.h" + +static avl_tree_t cmap_sym; +static avl_tree_t cmap_wc; + +typedef struct charmap { + const char *name; + wchar_t wc; + avl_node_t avl_sym; + avl_node_t avl_wc; +} charmap_t; + + +/* + * Array of POSIX specific portable characters. + */ + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdiscarded-qualifiers" + +static const struct { + char *name; + int ch; +} portable_chars[] = { + { "NUL", '\0' }, + { "alert", '\a' }, + { "backspace", '\b' }, + { "tab", '\t' }, + { "carriage-return", '\r' }, + { "newline", '\n' }, + { "vertical-tab", '\v' }, + { "form-feed", '\f' }, + { "space", ' ' }, + { "exclamation-mark", '!' }, + { "quotation-mark", '"' }, + { "number-sign", '#' }, + { "dollar-sign", '$' }, + { "percent-sign", '%' }, + { "ampersand", '&' }, + { "apostrophe", '\'' }, + { "left-parenthesis", '(' }, + { "right-parenthesis", '(' }, + { "asterisk", '*' }, + { "plus-sign", '+' }, + { "comma", ','}, + { "hyphen-minus", '-' }, + { "hyphen", '-' }, + { "full-stop", '.' }, + { "period", '.' }, + { "slash", '/' }, + { "solidus", '/' }, + { "zero", '0' }, + { "one", '1' }, + { "two", '2' }, + { "three", '3' }, + { "four", '4' }, + { "five", '5' }, + { "six", '6' }, + { "seven", '7' }, + { "eight", '8' }, + { "nine", '9' }, + { "colon", ':' }, + { "semicolon", ';' }, + { "less-than-sign", '<' }, + { "equals-sign", '=' }, + { "greater-than-sign", '>' }, + { "question-mark", '?' }, + { "commercial-at", '@' }, + { "left-square-bracket", '[' }, + { "backslash", '\\' }, + { "reverse-solidus", '\\' }, + { "right-square-bracket", ']' }, + { "circumflex", '^' }, + { "circumflex-accent", '^' }, + { "low-line", '_' }, + { "underscore", '_' }, + { "grave-accent", '`' }, + { "left-brace", '{' }, + { "left-curly-bracket", '{' }, + { "vertical-line", '|' }, + { "right-brace", '}' }, + { "right-curly-bracket", '}' }, + { "tilde", '~' }, + { "A", 'A' }, + { "B", 'B' }, + { "C", 'C' }, + { "D", 'D' }, + { "E", 'E' }, + { "F", 'F' }, + { "G", 'G' }, + { "H", 'H' }, + { "I", 'I' }, + { "J", 'J' }, + { "K", 'K' }, + { "L", 'L' }, + { "M", 'M' }, + { "N", 'N' }, + { "O", 'O' }, + { "P", 'P' }, + { "Q", 'Q' }, + { "R", 'R' }, + { "S", 'S' }, + { "T", 'T' }, + { "U", 'U' }, + { "V", 'V' }, + { "W", 'W' }, + { "X", 'X' }, + { "Y", 'Y' }, + { "Z", 'Z' }, + { "a", 'a' }, + { "b", 'b' }, + { "c", 'c' }, + { "d", 'd' }, + { "e", 'e' }, + { "f", 'f' }, + { "g", 'g' }, + { "h", 'h' }, + { "i", 'i' }, + { "j", 'j' }, + { "k", 'k' }, + { "l", 'l' }, + { "m", 'm' }, + { "n", 'n' }, + { "o", 'o' }, + { "p", 'p' }, + { "q", 'q' }, + { "r", 'r' }, + { "s", 's' }, + { "t", 't' }, + { "u", 'u' }, + { "v", 'v' }, + { "w", 'w' }, + { "x", 'x' }, + { "y", 'y' }, + { "z", 'z' }, + { NULL, 0 } +}; + +#pragma GCC diagnostic pop + +static int +cmap_compare_sym(const void *n1, const void *n2) +{ + const charmap_t *c1 = n1; + const charmap_t *c2 = n2; + int rv; + + rv = strcmp(c1->name, c2->name); + return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); +} + +static int +cmap_compare_wc(const void *n1, const void *n2) +{ + const charmap_t *c1 = n1; + const charmap_t *c2 = n2; + + return ((c1->wc < c2->wc) ? -1 : (c1->wc > c2->wc) ? 1 : 0); +} + +void +init_charmap(void) +{ + avl_create(&cmap_sym, cmap_compare_sym, sizeof (charmap_t), + offsetof(charmap_t, avl_sym)); + + avl_create(&cmap_wc, cmap_compare_wc, sizeof (charmap_t), + offsetof(charmap_t, avl_wc)); +} + +static void +add_charmap_impl(char *sym, wchar_t wc, int nodups) +{ + charmap_t srch; + charmap_t *n = NULL; + avl_index_t where; + + srch.wc = wc; + srch.name = sym; + + /* + * also possibly insert the wide mapping, although note that there + * can only be one of these per wide character code. + */ + if ((wc != -1) && ((avl_find(&cmap_wc, &srch, &where)) == NULL)) { + if ((n = calloc(1, sizeof (*n))) == NULL) { + errf("out of memory"); + return; + } + n->wc = wc; + avl_insert(&cmap_wc, n, where); + } + + if (sym) { + if (avl_find(&cmap_sym, &srch, &where) != NULL) { + if (nodups) { + errf("duplicate character definition"); + } + return; + } + if ((n == NULL) && ((n = calloc(1, sizeof (*n))) == NULL)) { + errf("out of memory"); + return; + } + n->wc = wc; + n->name = sym; + + avl_insert(&cmap_sym, n, where); + } +} + +void +add_charmap(char *sym, int c) +{ + add_charmap_impl(sym, c, 1); +} + +void +add_charmap_undefined(char *sym) +{ + charmap_t srch; + charmap_t *cm = NULL; + + srch.name = sym; + cm = avl_find(&cmap_sym, &srch, NULL); + + if ((undefok == 0) && ((cm == NULL) || (cm->wc == -1))) { + warn("undefined symbol <%s>", sym); + add_charmap_impl(sym, -1, 0); + } else { + free(sym); + } +} + +void +add_charmap_range(char *s, char *e, int wc) +{ + int ls, le; + int si; + int sn, en; + int i; + + static const char *digits = "0123456789"; + + ls = strlen(s); + le = strlen(e); + + if (((si = strcspn(s, digits)) == 0) || (si == ls) || + (strncmp(s, e, si) != 0) || + ((int)strspn(s + si, digits) != (ls - si)) || + ((int)strspn(e + si, digits) != (le - si)) || + ((sn = atoi(s + si)) > ((en = atoi(e + si))))) { + errf("malformed charmap range"); + return; + } + + s[si] = 0; + + for (i = sn; i <= en; i++) { + char *nn; + (void) asprintf(&nn, "%s%0*u", s, ls - si, i); + if (nn == NULL) { + errf("out of memory"); + return; + } + + add_charmap_impl(nn, wc, 1); + wc++; + } + free(s); + free(e); +} + +void +add_charmap_char(char *name, int val) +{ + add_charmap_impl(name, val, 0); +} + +/* + * POSIX insists that certain entries be present, even when not in the + * orginal charmap file. + */ +void +add_charmap_posix(void) +{ + int i; + + for (i = 0; portable_chars[i].name; i++) { + add_charmap_char(portable_chars[i].name, portable_chars[i].ch); + } +} + +int +lookup_charmap(const char *sym, wchar_t *wc) +{ + charmap_t srch; + charmap_t *n; + + srch.name = sym; + n = avl_find(&cmap_sym, &srch, NULL); + if (n && n->wc != -1) { + if (wc) + *wc = n->wc; + return (0); + } + return (-1); +} + +int +check_charmap(wchar_t wc) +{ + charmap_t srch; + + srch.wc = wc; + return (avl_find(&cmap_wc, &srch, NULL) ? 0 : -1); +} diff --git a/usr.bin/localedef/collate.c b/usr.bin/localedef/collate.c new file mode 100644 index 000000000000..5a73cbae9173 --- /dev/null +++ b/usr.bin/localedef/collate.c @@ -0,0 +1,1299 @@ +/* + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 John Marino + * + * This source code is derived from the illumos localedef command, and + * provided under BSD-style license terms by Nexenta Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * LC_COLLATE database generation routines for localedef. + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include "localedef.h" +#include "parser.h" +#include "collate.h" + +/* + * Design notes. + * + * It will be extremely helpful to the reader if they have access to + * the localedef and locale file format specifications available. + * Latest versions of these are available from www.opengroup.org. + * + * The design for the collation code is a bit complex. The goal is a + * single collation database as described in collate.h (in + * libc/port/locale). However, there are some other tidbits: + * + * a) The substitution entries are now a directly indexable array. A + * priority elsewhere in the table is taken as an index into the + * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY) + * set. (The bit is cleared and the result is the index into the + * table. + * + * b) We eliminate duplicate entries into the substitution table. + * This saves a lot of space. + * + * c) The priorities for each level are "compressed", so that each + * sorting level has consecutively numbered priorities starting at 1. + * (O is reserved for the ignore priority.) This means sort levels + * which only have a few distinct priorities can represent the + * priority level in fewer bits, which makes the strxfrm output + * smaller. + * + * d) We record the total number of priorities so that strxfrm can + * figure out how many bytes to expand a numeric priority into. + * + * e) For the UNDEFINED pass (the last pass), we record the maximum + * number of bits needed to uniquely prioritize these entries, so that + * the last pass can also use smaller strxfrm output when possible. + * + * f) Priorities with the sign bit set are verboten. This works out + * because no active character set needs that bit to carry significant + * information once the character is in wide form. + * + * To process the entire data to make the database, we actually run + * multiple passes over the data. + * + * The first pass, which is done at parse time, identifies elements, + * substitutions, and such, and records them in priority order. As + * some priorities can refer to other priorities, using forward + * references, we use a table of references indicating whether the + * priority's value has been resolved, or whether it is still a + * reference. + * + * The second pass walks over all the items in priority order, noting + * that they are used directly, and not just an indirect reference. + * This is done by creating a "weight" structure for the item. The + * weights are stashed in an AVL tree sorted by relative "priority". + * + * The third pass walks over all the weight structures, in priority + * order, and assigns a new monotonically increasing (per sort level) + * weight value to them. These are the values that will actually be + * written to the file. + * + * The fourth pass just writes the data out. + */ + +/* + * In order to resolve the priorities, we create a table of priorities. + * Entries in the table can be in one of three states. + * + * UNKNOWN is for newly allocated entries, and indicates that nothing + * is known about the priority. (For example, when new entries are created + * for collating-symbols, this is the value assigned for them until the + * collating symbol's order has been determined. + * + * RESOLVED is used for an entry where the priority indicates the final + * numeric weight. + * + * REFER is used for entries that reference other entries. Typically + * this is used for forward references. A collating-symbol can never + * have this value. + * + * The "pass" field is used during final resolution to aid in detection + * of referencing loops. (For example depends on , but has its + * priority dependent on .) + */ +typedef enum { + UNKNOWN, /* priority is totally unknown */ + RESOLVED, /* priority value fully resolved */ + REFER /* priority is a reference (index) */ +} res_t; + +typedef struct weight { + int32_t pri; + int opt; + avl_node_t avl; +} weight_t; + +typedef struct priority { + res_t res; + int32_t pri; + int pass; + int lineno; +} collpri_t; + +#define NUM_WT collinfo.directive_count + +/* + * These are the abstract collating symbols, which are just a symbolic + * way to reference a priority. + */ +struct collsym { + char *name; + int32_t ref; + avl_node_t avl; +}; + +/* + * These are also abstract collating symbols, but we allow them to have + * different priorities at different levels. + */ +typedef struct collundef { + char *name; + int32_t ref[COLL_WEIGHTS_MAX]; + avl_node_t avl; +} collundef_t; + +/* + * These are called "chains" in libc. This records the fact that two + * more characters should be treated as a single collating entity when + * they appear together. For example, in Spanish gets collated + * as a character between and . + */ +struct collelem { + char *symbol; + wchar_t *expand; + int32_t ref[COLL_WEIGHTS_MAX]; + avl_node_t avl_bysymbol; + avl_node_t avl_byexpand; +}; + +/* + * Individual characters have a sequence of weights as well. + */ +typedef struct collchar { + wchar_t wc; + int32_t ref[COLL_WEIGHTS_MAX]; + avl_node_t avl; +} collchar_t; + +/* + * Substitution entries. The key is itself a priority. Note that + * when we create one of these, we *automatically* wind up with a + * fully resolved priority for the key, because creation of + * substitutions creates a resolved priority at the same time. + */ +typedef struct { + int32_t key; + int32_t ref[COLLATE_STR_LEN]; + avl_node_t avl; + avl_node_t avl_ref; +} subst_t; + +static avl_tree_t collsyms; +static avl_tree_t collundefs; +static avl_tree_t elem_by_symbol; +static avl_tree_t elem_by_expand; +static avl_tree_t collchars; +static avl_tree_t substs[COLL_WEIGHTS_MAX]; +static avl_tree_t substs_ref[COLL_WEIGHTS_MAX]; +static avl_tree_t weights[COLL_WEIGHTS_MAX]; +static int32_t nweight[COLL_WEIGHTS_MAX]; + +/* + * This is state tracking for the ellipsis token. Note that we start + * the initial values so that the ellipsis logic will think we got a + * magic starting value of NUL. It starts at minus one because the + * starting point is exclusive -- i.e. the starting point is not + * itself handled by the ellipsis code. + */ +static int currorder = EOF; +static int lastorder = EOF; +static collelem_t *currelem; +static collchar_t *currchar; +static collundef_t *currundef; +static wchar_t ellipsis_start = 0; +static int32_t ellipsis_weights[COLL_WEIGHTS_MAX]; + +/* + * We keep a running tally of weights. + */ +static int nextpri = 1; +static int nextsubst[COLL_WEIGHTS_MAX] = { 0 }; + +/* + * This array collects up the weights for each level. + */ +static int32_t order_weights[COLL_WEIGHTS_MAX]; +static int curr_weight = 0; +static int32_t subst_weights[COLLATE_STR_LEN]; +static int curr_subst = 0; + +/* + * Some initial priority values. + */ +static int32_t pri_undefined[COLL_WEIGHTS_MAX]; +static int32_t pri_ignore; + +static collate_info_t collinfo; + +static collpri_t *prilist = NULL; +static int numpri = 0; +static int maxpri = 0; + +static void start_order(int); + +static int32_t +new_pri(void) +{ + int i; + + if (numpri >= maxpri) { + maxpri = maxpri ? maxpri * 2 : 1024; + prilist = realloc(prilist, sizeof (collpri_t) * maxpri); + if (prilist == NULL) { + fprintf(stderr,"out of memory"); + return (-1); + } + for (i = numpri; i < maxpri; i++) { + prilist[i].res = UNKNOWN; + prilist[i].pri = 0; + prilist[i].pass = 0; + } + } + return (numpri++); +} + +static collpri_t * +get_pri(int32_t ref) +{ + if ((ref < 0) || (ref > numpri)) { + INTERR; + return (NULL); + } + return (&prilist[ref]); +} + +static void +set_pri(int32_t ref, int32_t v, res_t res) +{ + collpri_t *pri; + + pri = get_pri(ref); + + if ((res == REFER) && ((v < 0) || (v >= numpri))) { + INTERR; + } + + /* Resolve self references */ + if ((res == REFER) && (ref == v)) { + v = nextpri; + res = RESOLVED; + } + + if (pri->res != UNKNOWN) { + warn("repeated item in order list (first on %d)", + pri->lineno); + return; + } + pri->lineno = lineno; + pri->pri = v; + pri->res = res; +} + +static int32_t +resolve_pri(int32_t ref) +{ + collpri_t *pri; + static int32_t pass = 0; + + pri = get_pri(ref); + pass++; + while (pri->res == REFER) { + if (pri->pass == pass) { + /* report a line with the circular symbol */ + lineno = pri->lineno; + fprintf(stderr,"circular reference in order list"); + return (-1); + } + if ((pri->pri < 0) || (pri->pri >= numpri)) { + INTERR; + return (-1); + } + pri->pass = pass; + pri = &prilist[pri->pri]; + } + + if (pri->res == UNKNOWN) { + return (-1); + } + if (pri->res != RESOLVED) + INTERR; + + return (pri->pri); +} + +static int +weight_compare(const void *n1, const void *n2) +{ + int32_t k1 = ((const weight_t *)n1)->pri; + int32_t k2 = ((const weight_t *)n2)->pri; + + return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); +} + +static int +collsym_compare(const void *n1, const void *n2) +{ + const collsym_t *c1 = n1; + const collsym_t *c2 = n2; + int rv; + + rv = strcmp(c1->name, c2->name); + return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); +} + +static int +collundef_compare(const void *n1, const void *n2) +{ + const collundef_t *c1 = n1; + const collundef_t *c2 = n2; + int rv; + + rv = strcmp(c1->name, c2->name); + return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); +} + +static int +element_compare_symbol(const void *n1, const void *n2) +{ + const collelem_t *c1 = n1; + const collelem_t *c2 = n2; + int rv; + + rv = strcmp(c1->symbol, c2->symbol); + return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); +} + +static int +element_compare_expand(const void *n1, const void *n2) +{ + const collelem_t *c1 = n1; + const collelem_t *c2 = n2; + int rv; + + rv = wcscmp(c1->expand, c2->expand); + return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); +} + +static int +collchar_compare(const void *n1, const void *n2) +{ + wchar_t k1 = ((const collchar_t *)n1)->wc; + wchar_t k2 = ((const collchar_t *)n2)->wc; + + return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); +} + +static int +subst_compare(const void *n1, const void *n2) +{ + int32_t k1 = ((const subst_t *)n1)->key; + int32_t k2 = ((const subst_t *)n2)->key; + + return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0); +} + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-qual" + +static int +subst_compare_ref(const void *n1, const void *n2) +{ + int32_t *c1 = ((subst_t *)n1)->ref; + int32_t *c2 = ((subst_t *)n2)->ref; + int rv; + + rv = wcscmp((wchar_t *)c1, (wchar_t *)c2); + return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); +} + +#pragma GCC diagnostic pop + +void +init_collate(void) +{ + int i; + + avl_create(&collsyms, collsym_compare, sizeof (collsym_t), + offsetof(collsym_t, avl)); + + avl_create(&collundefs, collundef_compare, sizeof (collsym_t), + offsetof(collundef_t, avl)); + + avl_create(&elem_by_symbol, element_compare_symbol, sizeof (collelem_t), + offsetof(collelem_t, avl_bysymbol)); + avl_create(&elem_by_expand, element_compare_expand, sizeof (collelem_t), + offsetof(collelem_t, avl_byexpand)); + + avl_create(&collchars, collchar_compare, sizeof (collchar_t), + offsetof(collchar_t, avl)); + + for (i = 0; i < COLL_WEIGHTS_MAX; i++) { + avl_create(&substs[i], subst_compare, sizeof (subst_t), + offsetof(subst_t, avl)); + avl_create(&substs_ref[i], subst_compare_ref, + sizeof (subst_t), offsetof(subst_t, avl_ref)); + avl_create(&weights[i], weight_compare, sizeof (weight_t), + offsetof(weight_t, avl)); + nweight[i] = 1; + } + + (void) memset(&collinfo, 0, sizeof (collinfo)); + + /* allocate some initial priorities */ + pri_ignore = new_pri(); + + set_pri(pri_ignore, 0, RESOLVED); + + for (i = 0; i < COLL_WEIGHTS_MAX; i++) { + pri_undefined[i] = new_pri(); + + /* we will override this later */ + set_pri(pri_undefined[i], COLLATE_MAX_PRIORITY, UNKNOWN); + } +} + +void +define_collsym(char *name) +{ + collsym_t *sym; + avl_index_t where; + + if ((sym = calloc(sizeof (*sym), 1)) == NULL) { + fprintf(stderr,"out of memory"); + return; + } + sym->name = name; + sym->ref = new_pri(); + + if (avl_find(&collsyms, sym, &where) != NULL) { + /* + * This should never happen because we are only called + * for undefined symbols. + */ + INTERR; + return; + } + avl_insert(&collsyms, sym, where); +} + +collsym_t * +lookup_collsym(char *name) +{ + collsym_t srch; + + srch.name = name; + return (avl_find(&collsyms, &srch, NULL)); +} + +collelem_t * +lookup_collelem(char *symbol) +{ + collelem_t srch; + + srch.symbol = symbol; + return (avl_find(&elem_by_symbol, &srch, NULL)); +} + +static collundef_t * +get_collundef(char *name) +{ + collundef_t srch; + collundef_t *ud; + avl_index_t where; + int i; + + srch.name = name; + if ((ud = avl_find(&collundefs, &srch, &where)) == NULL) { + if (((ud = calloc(sizeof (*ud), 1)) == NULL) || + ((ud->name = strdup(name)) == NULL)) { + fprintf(stderr,"out of memory"); + return (NULL); + } + for (i = 0; i < NUM_WT; i++) { + ud->ref[i] = new_pri(); + } + avl_insert(&collundefs, ud, where); + } + add_charmap_undefined(name); + return (ud); +} + +static collchar_t * +get_collchar(wchar_t wc, int create) +{ + collchar_t srch; + collchar_t *cc; + avl_index_t where; + int i; + + srch.wc = wc; + cc = avl_find(&collchars, &srch, &where); + if ((cc == NULL) && create) { + if ((cc = calloc(sizeof (*cc), 1)) == NULL) { + fprintf(stderr, "out of memory"); + return (NULL); + } + for (i = 0; i < NUM_WT; i++) { + cc->ref[i] = new_pri(); + } + cc->wc = wc; + avl_insert(&collchars, cc, where); + } + return (cc); +} + +void +end_order_collsym(collsym_t *sym) +{ + start_order(T_COLLSYM); + /* update the weight */ + + set_pri(sym->ref, nextpri, RESOLVED); + nextpri++; +} + +void +end_order(void) +{ + int i; + int32_t pri; + int32_t ref; + collpri_t *p; + + /* advance the priority/weight */ + pri = nextpri; + + switch (currorder) { + case T_CHAR: + for (i = 0; i < NUM_WT; i++) { + if (((ref = order_weights[i]) < 0) || + ((p = get_pri(ref)) == NULL) || + (p->pri == -1)) { + /* unspecified weight is a self reference */ + set_pri(currchar->ref[i], pri, RESOLVED); + } else { + set_pri(currchar->ref[i], ref, REFER); + } + order_weights[i] = -1; + } + + /* leave a cookie trail in case next symbol is ellipsis */ + ellipsis_start = currchar->wc + 1; + currchar = NULL; + break; + + case T_ELLIPSIS: + /* save off the weights were we can find them */ + for (i = 0; i < NUM_WT; i++) { + ellipsis_weights[i] = order_weights[i]; + order_weights[i] = -1; + } + break; + + case T_COLLELEM: + if (currelem == NULL) { + INTERR; + } else { + for (i = 0; i < NUM_WT; i++) { + + if (((ref = order_weights[i]) < 0) || + ((p = get_pri(ref)) == NULL) || + (p->pri == -1)) { + set_pri(currelem->ref[i], pri, + RESOLVED); + } else { + set_pri(currelem->ref[i], ref, REFER); + } + order_weights[i] = -1; + } + } + break; + + case T_UNDEFINED: + for (i = 0; i < NUM_WT; i++) { + if (((ref = order_weights[i]) < 0) || + ((p = get_pri(ref)) == NULL) || + (p->pri == -1)) { + set_pri(pri_undefined[i], -1, RESOLVED); + } else { + set_pri(pri_undefined[i], ref, REFER); + } + order_weights[i] = -1; + } + break; + + case T_SYMBOL: + for (i = 0; i < NUM_WT; i++) { + if (((ref = order_weights[i]) < 0) || + ((p = get_pri(ref)) == NULL) || + (p->pri == -1)) { + set_pri(currundef->ref[i], pri, RESOLVED); + } else { + set_pri(currundef->ref[i], ref, REFER); + } + order_weights[i] = -1; + } + break; + + default: + INTERR; + } + + nextpri++; +} + +static void +start_order(int type) +{ + int i; + + lastorder = currorder; + currorder = type; + + /* this is used to protect ELLIPSIS processing */ + if ((lastorder == T_ELLIPSIS) && (type != T_CHAR)) { + fprintf(stderr, "character value expected"); + } + + for (i = 0; i < COLL_WEIGHTS_MAX; i++) { + order_weights[i] = -1; + } + curr_weight = 0; +} + +void +start_order_undefined(void) +{ + start_order(T_UNDEFINED); +} + +void +start_order_symbol(char *name) +{ + currundef = get_collundef(name); + start_order(T_SYMBOL); +} + +void +start_order_char(wchar_t wc) +{ + collchar_t *cc; + int32_t ref; + + start_order(T_CHAR); + + /* + * If we last saw an ellipsis, then we need to close the range. + * Handle that here. Note that we have to be careful because the + * items *inside* the range are treated exclusiveley to the items + * outside of the range. The ends of the range can have quite + * different weights than the range members. + */ + if (lastorder == T_ELLIPSIS) { + int i; + + if (wc < ellipsis_start) { + fprintf(stderr, "malformed range!"); + return; + } + while (ellipsis_start < wc) { + /* + * pick all of the saved weights for the + * ellipsis. note that -1 encodes for the + * ellipsis itself, which means to take the + * current relative priority. + */ + if ((cc = get_collchar(ellipsis_start, 1)) == NULL) { + INTERR; + return; + } + for (i = 0; i < NUM_WT; i++) { + collpri_t *p; + if (((ref = ellipsis_weights[i]) == -1) || + ((p = get_pri(ref)) == NULL) || + (p->pri == -1)) { + set_pri(cc->ref[i], nextpri, RESOLVED); + } else { + set_pri(cc->ref[i], ref, REFER); + } + ellipsis_weights[i] = 0; + } + ellipsis_start++; + nextpri++; + } + } + + currchar = get_collchar(wc, 1); +} + +void +start_order_collelem(collelem_t *e) +{ + start_order(T_COLLELEM); + currelem = e; +} + +void +start_order_ellipsis(void) +{ + int i; + + start_order(T_ELLIPSIS); + + if (lastorder != T_CHAR) { + fprintf(stderr, "illegal starting point for range"); + return; + } + + for (i = 0; i < NUM_WT; i++) { + ellipsis_weights[i] = order_weights[i]; + } +} + +void +define_collelem(char *name, wchar_t *wcs) +{ + collelem_t *e; + avl_index_t where1; + avl_index_t where2; + int i; + + if (wcslen(wcs) >= COLLATE_STR_LEN) { + fprintf(stderr,"expanded collation element too long"); + return; + } + + if ((e = calloc(sizeof (*e), 1)) == NULL) { + fprintf(stderr, "out of memory"); + return; + } + e->expand = wcs; + e->symbol = name; + + /* + * This is executed before the order statement, so we don't + * know how many priorities we *really* need. We allocate one + * for each possible weight. Not a big deal, as collating-elements + * prove to be quite rare. + */ + for (i = 0; i < COLL_WEIGHTS_MAX; i++) { + e->ref[i] = new_pri(); + } + + /* A character sequence can only reduce to one element. */ + if ((avl_find(&elem_by_symbol, e, &where1) != NULL) || + (avl_find(&elem_by_expand, e, &where2) != NULL)) { + fprintf(stderr, "duplicate collating element definition"); + return; + } + avl_insert(&elem_by_symbol, e, where1); + avl_insert(&elem_by_expand, e, where2); +} + +void +add_order_bit(int kw) +{ + uint8_t bit = DIRECTIVE_UNDEF; + + switch (kw) { + case T_FORWARD: + bit = DIRECTIVE_FORWARD; + break; + case T_BACKWARD: + bit = DIRECTIVE_BACKWARD; + break; + case T_POSITION: + bit = DIRECTIVE_POSITION; + break; + default: + INTERR; + break; + } + collinfo.directive[collinfo.directive_count] |= bit; +} + +void +add_order_directive(void) +{ + if (collinfo.directive_count >= COLL_WEIGHTS_MAX) { + fprintf(stderr,"too many directives (max %d)", COLL_WEIGHTS_MAX); + } + collinfo.directive_count++; +} + +static void +add_order_pri(int32_t ref) +{ + if (curr_weight >= NUM_WT) { + fprintf(stderr,"too many weights (max %d)", NUM_WT); + return; + } + order_weights[curr_weight] = ref; + curr_weight++; +} + +void +add_order_collsym(collsym_t *s) +{ + add_order_pri(s->ref); +} + +void +add_order_char(wchar_t wc) +{ + collchar_t *cc; + + if ((cc = get_collchar(wc, 1)) == NULL) { + INTERR; + return; + } + + add_order_pri(cc->ref[curr_weight]); +} + +void +add_order_collelem(collelem_t *e) +{ + add_order_pri(e->ref[curr_weight]); +} + +void +add_order_ignore(void) +{ + add_order_pri(pri_ignore); +} + +void +add_order_symbol(char *sym) +{ + collundef_t *c; + if ((c = get_collundef(sym)) == NULL) { + INTERR; + return; + } + add_order_pri(c->ref[curr_weight]); +} + +void +add_order_ellipsis(void) +{ + /* special NULL value indicates self reference */ + add_order_pri(0); +} + +void +add_order_subst(void) +{ + subst_t srch; + subst_t *s; + avl_index_t where; + int i; + + (void) memset(&srch, 0, sizeof (srch)); + for (i = 0; i < curr_subst; i++) { + srch.ref[i] = subst_weights[i]; + subst_weights[i] = 0; + } + s = avl_find(&substs_ref[curr_weight], &srch, &where); + + if (s == NULL) { + if ((s = calloc(sizeof (*s), 1)) == NULL) { + fprintf(stderr,"out of memory"); + return; + } + s->key = new_pri(); + + /* + * We use a self reference for our key, but we set a + * high bit to indicate that this is a substitution + * reference. This will expedite table lookups later, + * and prevent table lookups for situations that don't + * require it. (In short, its a big win, because we + * can skip a lot of binary searching.) + */ + set_pri(s->key, + (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY), + RESOLVED); + nextsubst[curr_weight] += 1; + + for (i = 0; i < curr_subst; i++) { + s->ref[i] = srch.ref[i]; + } + + avl_insert(&substs_ref[curr_weight], s, where); + + if (avl_find(&substs[curr_weight], s, &where) != NULL) { + INTERR; + return; + } + avl_insert(&substs[curr_weight], s, where); + } + curr_subst = 0; + + + /* + * We are using the current (unique) priority as a search key + * in the substitution table. + */ + add_order_pri(s->key); +} + +static void +add_subst_pri(int32_t ref) +{ + if (curr_subst >= COLLATE_STR_LEN) { + fprintf(stderr,"substitution string is too long"); + return; + } + subst_weights[curr_subst] = ref; + curr_subst++; +} + +void +add_subst_char(wchar_t wc) +{ + collchar_t *cc; + + + if (((cc = get_collchar(wc, 1)) == NULL) || + (cc->wc != wc)) { + INTERR; + return; + } + /* we take the weight for the character at that position */ + add_subst_pri(cc->ref[curr_weight]); +} + +void +add_subst_collelem(collelem_t *e) +{ + add_subst_pri(e->ref[curr_weight]); +} + +void +add_subst_collsym(collsym_t *s) +{ + add_subst_pri(s->ref); +} + +void +add_subst_symbol(char *ptr) +{ + collundef_t *cu; + + if ((cu = get_collundef(ptr)) != NULL) { + add_subst_pri(cu->ref[curr_weight]); + } +} + +void +add_weight(int32_t ref, int pass) +{ + weight_t srch; + weight_t *w; + avl_index_t where; + + srch.pri = resolve_pri(ref); + + /* No translation of ignores */ + if (srch.pri == 0) + return; + + /* Substitution priorities are not weights */ + if (srch.pri & COLLATE_SUBST_PRIORITY) + return; + + if (avl_find(&weights[pass], &srch, &where) != NULL) + return; + + if ((w = calloc(sizeof (*w), 1)) == NULL) { + fprintf(stderr, "out of memory"); + return; + } + w->pri = srch.pri; + avl_insert(&weights[pass], w, where); +} + +void +add_weights(int32_t *refs) +{ + int i; + for (i = 0; i < NUM_WT; i++) { + add_weight(refs[i], i); + } +} + +int32_t +get_weight(int32_t ref, int pass) +{ + weight_t srch; + weight_t *w; + int32_t pri; + + pri = resolve_pri(ref); + if (pri & COLLATE_SUBST_PRIORITY) { + return (pri); + } + if (pri <= 0) { + return (pri); + } + srch.pri = pri; + if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) { + INTERR; + return (-1); + } + return (w->opt); +} + +wchar_t * +wsncpy(wchar_t *s1, const wchar_t *s2, size_t n) +{ + wchar_t *os1 = s1; + + n++; + while (--n > 0 && (*s1++ = *s2++) != 0) + continue; + if (n > 0) + while (--n > 0) + *s1++ = 0; + return (os1); +} + +void +dump_collate(void) +{ + FILE *f; + int i, j, n; + size_t sz; + int32_t pri; + collelem_t *ce; + collchar_t *cc; + subst_t *sb; + char vers[COLLATE_STR_LEN]; + collate_char_t chars[UCHAR_MAX + 1]; + collate_large_t *large; + collate_subst_t *subst[COLL_WEIGHTS_MAX]; + collate_chain_t *chain; + + /* + * We have to run throught a preliminary pass to identify all the + * weights that we use for each sorting level. + */ + for (i = 0; i < NUM_WT; i++) { + add_weight(pri_ignore, i); + } + for (i = 0; i < NUM_WT; i++) { + for (sb = avl_first(&substs[i]); sb; + sb = AVL_NEXT(&substs[i], sb)) { + for (j = 0; sb->ref[j]; j++) { + add_weight(sb->ref[j], i); + } + } + } + for (ce = avl_first(&elem_by_expand); + ce != NULL; + ce = AVL_NEXT(&elem_by_expand, ce)) { + add_weights(ce->ref); + } + for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) { + add_weights(cc->ref); + } + + /* + * Now we walk the entire set of weights, removing the gaps + * in the weights. This gives us optimum usage. The walk + * occurs in priority. + */ + for (i = 0; i < NUM_WT; i++) { + weight_t *w; + for (w = avl_first(&weights[i]); w; + w = AVL_NEXT(&weights[i], w)) { + w->opt = nweight[i]; + nweight[i] += 1; + } + } + + (void) memset(&chars, 0, sizeof (chars)); + (void) memset(vers, 0, COLLATE_STR_LEN); + (void) strlcpy(vers, COLLATE_VERSION, sizeof (vers)); + + /* + * We need to make sure we arrange for the UNDEFINED field + * to show up. Also, set the total weight counts. + */ + for (i = 0; i < NUM_WT; i++) { + if (resolve_pri(pri_undefined[i]) == -1) { + set_pri(pri_undefined[i], -1, RESOLVED); + /* they collate at the end of everything else */ + collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY; + } + collinfo.pri_count[i] = nweight[i]; + } + + collinfo.pri_count[NUM_WT] = max_wide(); + collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY; + collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED; + + /* + * Ordinary character priorities + */ + for (i = 0; i <= UCHAR_MAX; i++) { + if ((cc = get_collchar(i, 0)) != NULL) { + for (j = 0; j < NUM_WT; j++) { + chars[i].pri[j] = get_weight(cc->ref[j], j); + } + } else { + for (j = 0; j < NUM_WT; j++) { + chars[i].pri[j] = + get_weight(pri_undefined[j], j); + } + /* + * Per POSIX, for undefined characters, we + * also have to add a last item, which is the + * character code. + */ + chars[i].pri[NUM_WT] = i; + } + } + + /* + * Substitution tables + */ + for (i = 0; i < NUM_WT; i++) { + collate_subst_t *st = NULL; + n = collinfo.subst_count[i] = avl_numnodes(&substs[i]); + if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) { + fprintf(stderr, "out of memory"); + return; + } + n = 0; + for (sb = avl_first(&substs[i]); sb; + sb = AVL_NEXT(&substs[i], sb)) { + if ((st[n].key = resolve_pri(sb->key)) < 0) { + /* by definition these resolve! */ + INTERR; + } + if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) { + INTERR; + } + for (j = 0; sb->ref[j]; j++) { + st[n].pri[j] = get_weight(sb->ref[j], i); + } + n++; + } + if (n != collinfo.subst_count[i]) + INTERR; + subst[i] = st; + } + + + /* + * Chains, i.e. collating elements + */ + collinfo.chain_count = avl_numnodes(&elem_by_expand); + chain = calloc(sizeof (collate_chain_t), collinfo.chain_count); + if (chain == NULL) { + fprintf(stderr, "out of memory"); + return; + } + for (n = 0, ce = avl_first(&elem_by_expand); + ce != NULL; + ce = AVL_NEXT(&elem_by_expand, ce), n++) { + (void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN); + for (i = 0; i < NUM_WT; i++) { + chain[n].pri[i] = get_weight(ce->ref[i], i); + } + } + if (n != collinfo.chain_count) + INTERR; + + /* + * Large (> UCHAR_MAX) character priorities + */ + large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1); + if (large == NULL) { + fprintf(stderr, "out of memory"); + return; + } + + i = 0; + for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) { + int undef = 0; + /* we already gathered those */ + if (cc->wc <= UCHAR_MAX) + continue; + for (j = 0; j < NUM_WT; j++) { + if ((pri = get_weight(cc->ref[j], j)) < 0) { + undef = 1; + } + if (undef && (pri >= 0)) { + /* if undefined, then all priorities are */ + INTERR; + } else { + large[i].pri.pri[j] = pri; + } + } + if (!undef) { + large[i].val = cc->wc; + collinfo.large_count = i++; + } + } + + if ((f = open_category()) == NULL) { + return; + } + + /* Time to write the entire data set out */ + + if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) || + (wr_category(&collinfo, sizeof (collinfo), f) < 0) || + (wr_category(&chars, sizeof (chars), f) < 0)) { + return; + } + + for (i = 0; i < NUM_WT; i++) { + sz = sizeof (collate_subst_t) * collinfo.subst_count[i]; + if (wr_category(subst[i], sz, f) < 0) { + return; + } + } + sz = sizeof (collate_chain_t) * collinfo.chain_count; + if (wr_category(chain, sz, f) < 0) { + return; + } + sz = sizeof (collate_large_t) * collinfo.large_count; + if (wr_category(large, sz, f) < 0) { + return; + } + + close_category(f); +} diff --git a/usr.bin/localedef/ctype.c b/usr.bin/localedef/ctype.c new file mode 100644 index 000000000000..8cb68135b4ee --- /dev/null +++ b/usr.bin/localedef/ctype.c @@ -0,0 +1,464 @@ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2012 Garrett D'Amore All rights reserved. + * Copyright 2015 John Marino + * + * This source code is derived from the illumos localedef command, and + * provided under BSD-style license terms by Nexenta Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * LC_CTYPE database generation routines for localedef. + */ +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "localedef.h" +#include "parser.h" +#include "runefile.h" + + +#define _ISUPPER _CTYPE_U +#define _ISLOWER _CTYPE_L +#define _ISDIGIT _CTYPE_D +#define _ISXDIGIT _CTYPE_X +#define _ISSPACE _CTYPE_S +#define _ISBLANK _CTYPE_B +#define _ISALPHA _CTYPE_A +#define _ISPUNCT _CTYPE_P +#define _ISGRAPH _CTYPE_G +#define _ISPRINT _CTYPE_R +#define _ISCNTRL _CTYPE_C +#define _E1 _CTYPE_Q +#define _E2 _CTYPE_I +#define _E3 0 +#define _E4 0 +#define _E5 _CTYPE_T + +static avl_tree_t ctypes; + +static wchar_t last_ctype; + +typedef struct ctype_node { + wchar_t wc; + int32_t ctype; + int32_t toupper; + int32_t tolower; + avl_node_t avl; +} ctype_node_t; + +typedef struct width_node { + wchar_t start; + wchar_t end; + int8_t width; + avl_node_t avl; +} width_node_t; + +static int +ctype_compare(const void *n1, const void *n2) +{ + const ctype_node_t *c1 = n1; + const ctype_node_t *c2 = n2; + + return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0); +} + +void +init_ctype(void) +{ + avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t), + offsetof(ctype_node_t, avl)); +} + + +static void +add_ctype_impl(ctype_node_t *ctn) +{ + switch (last_kw) { + case T_ISUPPER: + ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT); + break; + case T_ISLOWER: + ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT); + break; + case T_ISALPHA: + ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT); + break; + case T_ISDIGIT: + ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT); + break; + case T_ISSPACE: + ctn->ctype |= _ISSPACE; + break; + case T_ISCNTRL: + ctn->ctype |= _ISCNTRL; + break; + case T_ISGRAPH: + ctn->ctype |= (_ISGRAPH | _ISPRINT); + break; + case T_ISPRINT: + ctn->ctype |= _ISPRINT; + break; + case T_ISPUNCT: + ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT); + break; + case T_ISXDIGIT: + ctn->ctype |= (_ISXDIGIT | _ISPRINT); + break; + case T_ISBLANK: + ctn->ctype |= (_ISBLANK | _ISSPACE); + break; + case T_ISPHONOGRAM: + ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH); + break; + case T_ISIDEOGRAM: + ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH); + break; + case T_ISENGLISH: + ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH); + break; + case T_ISNUMBER: + ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH); + break; + case T_ISSPECIAL: + ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH); + break; + case T_ISALNUM: + /* + * We can't do anything with this. The character + * should already be specified as a digit or alpha. + */ + break; + default: + errf("not a valid character class"); + } +} + +static ctype_node_t * +get_ctype(wchar_t wc) +{ + ctype_node_t srch; + ctype_node_t *ctn; + avl_index_t where; + + srch.wc = wc; + if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) { + if ((ctn = calloc(1, sizeof (*ctn))) == NULL) { + errf("out of memory"); + return (NULL); + } + ctn->wc = wc; + + avl_insert(&ctypes, ctn, where); + } + return (ctn); +} + +void +add_ctype(int val) +{ + ctype_node_t *ctn; + + if ((ctn = get_ctype(val)) == NULL) { + INTERR; + return; + } + add_ctype_impl(ctn); + last_ctype = ctn->wc; +} + +void +add_ctype_range(int end) +{ + ctype_node_t *ctn; + wchar_t cur; + + if (end < last_ctype) { + errf("malformed character range (%u ... %u))", + last_ctype, end); + return; + } + for (cur = last_ctype + 1; cur <= end; cur++) { + if ((ctn = get_ctype(cur)) == NULL) { + INTERR; + return; + } + add_ctype_impl(ctn); + } + last_ctype = end; + +} + +/* + * A word about widths: if the width mask is specified, then libc + * unconditionally honors it. Otherwise, it assumes printable + * characters have width 1, and non-printable characters have width + * -1 (except for NULL which is special with with 0). Hence, we have + * no need to inject defaults here -- the "default" unset value of 0 + * indicates that libc should use its own logic in wcwidth as described. + */ +void +add_width(int wc, int width) +{ + ctype_node_t *ctn; + + if ((ctn = get_ctype(wc)) == NULL) { + INTERR; + return; + } + ctn->ctype &= ~(_CTYPE_SWM); + switch (width) { + case 0: + ctn->ctype |= _CTYPE_SW0; + break; + case 1: + ctn->ctype |= _CTYPE_SW1; + break; + case 2: + ctn->ctype |= _CTYPE_SW2; + break; + case 3: + ctn->ctype |= _CTYPE_SW3; + break; + } +} + +void +add_width_range(int start, int end, int width) +{ + for (; start <= end; start++) { + add_width(start, width); + } +} + +void +add_caseconv(int val, int wc) +{ + ctype_node_t *ctn; + + ctn = get_ctype(val); + if (ctn == NULL) { + INTERR; + return; + } + + switch (last_kw) { + case T_TOUPPER: + ctn->toupper = wc; + break; + case T_TOLOWER: + ctn->tolower = wc; + break; + default: + INTERR; + break; + } +} + +void +dump_ctype(void) +{ + FILE *f; + _FileRuneLocale rl; + ctype_node_t *ctn, *last_ct, *last_lo, *last_up; + _FileRuneEntry *ct = NULL; + _FileRuneEntry *lo = NULL; + _FileRuneEntry *up = NULL; + wchar_t wc; + + (void) memset(&rl, 0, sizeof (rl)); + last_ct = NULL; + last_lo = NULL; + last_up = NULL; + + if ((f = open_category()) == NULL) + return; + + (void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8); + (void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding)); + + /* + * Initialize the identity map. + */ + for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) { + rl.maplower[wc] = wc; + rl.mapupper[wc] = wc; + } + + for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) { + int conflict = 0; + + + wc = ctn->wc; + + /* + * POSIX requires certain portable characters have + * certain types. Add them if they are missing. + */ + if ((wc >= 1) && (wc <= 127)) { + if ((wc >= 'A') && (wc <= 'Z')) + ctn->ctype |= _ISUPPER; + if ((wc >= 'a') && (wc <= 'z')) + ctn->ctype |= _ISLOWER; + if ((wc >= '0') && (wc <= '9')) + ctn->ctype |= _ISDIGIT; + if (strchr(" \f\n\r\t\v", (char)wc) != NULL) + ctn->ctype |= _ISSPACE; + if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL) + ctn->ctype |= _ISXDIGIT; + if (strchr(" \t", (char)wc)) + ctn->ctype |= _ISBLANK; + + /* + * Technically these settings are only + * required for the C locale. However, it + * turns out that because of the historical + * version of isprint(), we need them for all + * locales as well. Note that these are not + * necessarily valid punctation characters in + * the current language, but ispunct() needs + * to return TRUE for them. + */ + if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~", + (char)wc)) + ctn->ctype |= _ISPUNCT; + } + + /* + * POSIX also requires that certain types imply + * others. Add any inferred types here. + */ + if (ctn->ctype & (_ISUPPER |_ISLOWER)) + ctn->ctype |= _ISALPHA; + if (ctn->ctype & _ISDIGIT) + ctn->ctype |= _ISXDIGIT; + if (ctn->ctype & _ISBLANK) + ctn->ctype |= _ISSPACE; + if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT)) + ctn->ctype |= _ISGRAPH; + if (ctn->ctype & _ISGRAPH) + ctn->ctype |= _ISPRINT; + + /* + * Finally, POSIX requires that certain combinations + * are invalid. We don't flag this as a fatal error, + * but we will warn about. + */ + if ((ctn->ctype & _ISALPHA) && + (ctn->ctype & (_ISPUNCT|_ISDIGIT))) + conflict++; + if ((ctn->ctype & _ISPUNCT) & + (ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT))) + conflict++; + if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH)) + conflict++; + if ((ctn->ctype & _ISCNTRL) & _ISPRINT) + conflict++; + if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH))) + conflict++; + + if (conflict) { + warn("conflicting classes for character 0x%x (%x)", + wc, ctn->ctype); + } + /* + * Handle the lower 256 characters using the simple + * optimization. Note that if we have not defined the + * upper/lower case, then we identity map it. + */ + if ((unsigned)wc < _CACHED_RUNES) { + rl.runetype[wc] = ctn->ctype; + if (ctn->tolower) + rl.maplower[wc] = ctn->tolower; + if (ctn->toupper) + rl.mapupper[wc] = ctn->toupper; + continue; + } + + if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) { + ct[rl.runetype_ext_nranges-1].max = wc; + last_ct = ctn; + } else { + rl.runetype_ext_nranges++; + ct = realloc(ct, + sizeof (*ct) * rl.runetype_ext_nranges); + ct[rl.runetype_ext_nranges - 1].min = wc; + ct[rl.runetype_ext_nranges - 1].max = wc; + ct[rl.runetype_ext_nranges - 1].map = ctn->ctype; + last_ct = ctn; + } + if (ctn->tolower == 0) { + last_lo = NULL; + } else if ((last_lo != NULL) && + (last_lo->tolower + 1 == ctn->tolower)) { + lo[rl.maplower_ext_nranges-1].max = wc; + last_lo = ctn; + } else { + rl.maplower_ext_nranges++; + lo = realloc(lo, + sizeof (*lo) * rl.maplower_ext_nranges); + lo[rl.maplower_ext_nranges - 1].min = wc; + lo[rl.maplower_ext_nranges - 1].max = wc; + lo[rl.maplower_ext_nranges - 1].map = ctn->tolower; + last_lo = ctn; + } + + if (ctn->toupper == 0) { + last_up = NULL; + } else if ((last_up != NULL) && + (last_up->toupper + 1 == ctn->toupper)) { + up[rl.mapupper_ext_nranges-1].max = wc; + last_up = ctn; + } else { + rl.mapupper_ext_nranges++; + up = realloc(up, + sizeof (*up) * rl.mapupper_ext_nranges); + up[rl.mapupper_ext_nranges - 1].min = wc; + up[rl.mapupper_ext_nranges - 1].max = wc; + up[rl.mapupper_ext_nranges - 1].map = ctn->toupper; + last_up = ctn; + } + } + + if ((wr_category(&rl, sizeof (rl), f) < 0) || + (wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) || + (wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) || + (wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) { + return; + } + + close_category(f); +} diff --git a/usr.bin/localedef/localedef.1 b/usr.bin/localedef/localedef.1 new file mode 100644 index 000000000000..7cd026d78437 --- /dev/null +++ b/usr.bin/localedef/localedef.1 @@ -0,0 +1,238 @@ +.\" Copyright (c) 1992, X/Open Company Limited All Rights Reserved +.\" Portions Copyright (c) 2003, Sun Microsystems, Inc. All Rights Reserved +.\" Portions Copyright 2013 DEY Storage Systems, Inc. +.\" Sun Microsystems, Inc. gratefully acknowledges The Open Group for +.\" permission to reproduce portions of its copyrighted documentation. +.\" Original documentation from The Open Group can be obtained online at +.\" http://www.opengroup.org/bookstore/. +.\" The Institute of Electrical and Electronics Engineers and The Open Group, +.\" have given us permission to reprint portions of their documentation. In +.\" the following statement, the phrase "this text" refers to portions of the +.\" system documentation. Portions of this text are reprinted and reproduced +.\" in electronic form in the Sun OS Reference Manual, from IEEE Std 1003.1, +.\" 2004 Edition, Standard for Information Technology -- Portable Operating +.\" System Interface (POSIX), The Open Group Base Specifications Issue 6, +.\" Copyright (C) 2001-2004 by the Institute of Electrical and Electronics +.\" Engineers, Inc and The Open Group. In the event of any discrepancy between +.\" these versions and the original IEEE and The Open Group Standard, the +.\" original IEEE and The Open Group Standard is the referee document. The +.\" original Standard can be obtained online at +.\" http://www.opengroup.org/unix/online.html. +.\" This notice shall appear on any product containing this material. +.\" The contents of this file are subject to the terms of the Common +.\" Development and Distribution License (the "License"). You may not use +.\" this file except in compliance with the License. +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or +.\" http://www.opensolaris.org/os/licensing. See the License for the specific +.\" language governing permissions and limitations under the License. +.\" When distributing Covered Code, include this CDDL HEADER in each file and +.\" include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, +.\" add the following below this CDDL HEADER, with the fields enclosed by +.\" brackets "[]" replaced with your own identifying information: +.\" Portions Copyright [yyyy] [name of copyright owner] +.Dd July 28, 2015 +.Dt LOCALEDEF 1 +.Os +.Sh NAME +.Nm localedef +.Nd define locale environment +.Sh SYNOPSIS +.Nm +.Op Fl D +.Op Fl c +.Op Fl v +.Op Fl U +.Op Fl f Ar charmap +.Op Fl w Ar widthfile +.Op Fl i Ar sourcefile +.Op Fl u Ar codeset +localename +.Sh DESCRIPTION +The +.Nm localedef +utility converts source definitions for locale categories +into a format usable by the functions and utilities whose operational behavior +is determined by the setting of the locale environment variables; see +.Xr environ 5 . +.Pp +The utility reads source definitions for one or more locale categories +belonging to the same locale from the file named in the \fB-i\fR option (if +specified) or from standard input. +.Pp +Each category source definition is identified by the corresponding environment +variable name and terminated by an +.Sy END +.Em category-name +statement. The following categories are supported: +.Bl -tag -width LC_MONETARY +.It LC_CTYPE +Defines character classification and case conversion. +.It LC_COLLATE +Defines collation rules. +.It LC_MONETARY +Defines the format and symbols used in formatting of monetary information. +.It LC_NUMERIC +Defines the decimal delimiter, grouping and grouping symbol for non-monetary +numeric editing. +.It LC_TIME +Defines the format and content of date and time information. +.It LC_MESSAGES +Defines the format and values of affirmative and negative responses. +.El +.Pp +The following options are supported: +.Bl -tag -width xx_sourcefile +.It -D +BSD-style output. Rather than the default of creating the +.Sy localename +directory and creating files like LC_CTYPE, LC_COLLATE, etc, in that directory, +the output files have the format "." and are +dumped to the current directory. +.It -c +Creates permanent output even if warning messages have been issued. +.It -v +Emit verbose debugging output on standard output. +.It -U +Ignore the presence of character symbols that have no matching character +definition. This facilitates the use of a common locale definition file +to be used across multiple encodings, even when some symbols are not +present in a given encoding. +.It -f charmap +Specifies the pathname of a file containing a mapping of character symbols and +collating element symbols to actual character encodings. This option must be +specified if symbolic names (other than collating symbols defined in a +.Sy collating-symbol +keyword) are used. If the +.Sy -f +option is not present, the default character mapping will be used. +.It -w widthfile +The path name of the file containing character screen width definitions. +If not supplied, then default screen widths will be assumed, which will +generally not account for East Asian encodings requiring more than a single +character cell to display, nor for combining or accent marks that occupy +no additional screen width. +.It -i sourcefile +The path name of a file containing the source definitions. If this option is +not present, source definitions will be read from standard input. +.It -u codeset +Specifies the name of a codeset used as the target mapping of character symbols +and collating element symbols whose encoding values are defined in terms of the +ISO/IEC 10646-1: 2000 standard position constant values. See NOTES. +.El +.Pp +The following operands are required: +.Bl -tag -width localename +.It localename +Identifies the locale. If the name contains one or more slash characters, +.Ar localename +will be interpreted as a path name where the created locale +definitions will be stored. This capability may be restricted to users with +appropriate privileges. (As a consequence of specifying one +.Ar localename , +although several categories can be processed in one execution, only categories +belonging to the same locale can be processed.) +.El +.Sh OUTPUT +.Nm +creates a directory of files that represents the locale's data, unless instructed +otherwise by the +.Sy -D +(BSD output) option. The contants of this directory should generally be +copied into the appropriate subdirectory of /usr/share/locale in order the +definitions to be visible to programs linked with libc. +.Sh ENVIRONMENT +See +.Xr Benviron 5 +for definitions of the following environment variables that affect the execution of +.Nm : +.Sy LANG , +.Sy LC_ALL , +.Sy LC_COLLATE , +.Sy LC_CTYPE , +.Sy LC_MESSAGES , +.Sy LC_MONETARY , +.Sy LC_MUMERIC , +.Sy LC_TIME , +and +.Sy NLSPATH . +.Sh EXIT STATUS +The following exit values are returned: +.Bl -tag -width XX +.It 0 +No errors occurred and the locales were successfully created. +.It 1 +Warnings occurred and the locales were successfully created. +.It 2 +The locale specification exceeded implementation limits or the coded character +set or sets used were not supported by the implementation, and no locale was +created. +.It >3 +Warnings or errors occurred and no output was created. +.El +.Pp +If an error is detected, no permanent output will be created. +.Sh SEE ALSO +.Xr locale 1 , +.Xr iconv_open 3 , +.Xr nl_langinfo 3 , +.Xr strftime 3 , +.Xr environ 5 . +.Sh WARNINGS +If warnings occur, permanent output will be created if the +.Sy -c +option was specified. The following conditions will cause warning messages to be issued: +.Bl -tag -width X +.It * +If a symbolic name not found in the +.Em charmap +file is used for the descriptions of the +.Sy LC_CTYPE +or +.Sy LC_COLLATE +categories (for other categories, this will be an error condition). +.It * +If optional keywords not supported by the implementation are present in the +source. +.El +.Sh NOTES +When the +.Sy -u +option is used, the +.Em codeset +option-argument is interpreted as a name of a codeset to which the +ISO/IEC 10646-1: 2000 standard position constant values are converted. Both the +ISO/IEC 10646-1: 2000 standard position constant values and other formats (decimal, +hexadecimal, or octal) are valid as encoding values within the charmap file. The +codeset can be any codeset that is supported by the \fBiconv_open\fR(3C) function +on the system. +.Pp +When conflicts occur between the charmap specification of +.Em codeset , +.Em mb_cur_max , +or +.Em mb_cur_min +and the corresponding value for the codeset represented by the +.Sy -u +option-argument +.Em codeset , +the +.Nm +utility fails as an error. +.Pp +When conflicts occur between the charmap encoding values specified for symbolic +names of characters of the portable character set and the character encoding +values defined by the US-ASCII, the result is unspecified. +.Sh HISTORY +.Nm +first appeared in +.Dx +4.4. It was ported from Illumos from the point +.An Garrett D'Amore +.Aq garrett@nexenta.com +added multibyte support (October 2010). +.An John Marino +.Aq draco@marino.st +provided the alternations necessary to compile cleanly on +.Dx +as well as altered libc to use the new collation (the changes were also based +on Illumos, but modified to work with xlocale functionality.) diff --git a/usr.bin/localedef/localedef.c b/usr.bin/localedef/localedef.c new file mode 100644 index 000000000000..afc2e240860e --- /dev/null +++ b/usr.bin/localedef/localedef.c @@ -0,0 +1,346 @@ +/* + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 John Marino + * + * This source code is derived from the illumos localedef command, and + * provided under BSD-style license terms by Nexenta Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * POSIX localedef. + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "localedef.h" +#include "parser.h" + +#ifndef TEXT_DOMAIN +#define TEXT_DOMAIN "SYS_TEST" +#endif + +int bsd = 0; +int verbose = 0; +int undefok = 0; +int warnok = 0; +static char *locname = NULL; +static char locpath[PATH_MAX]; + +const char * +category_name(void) +{ + switch (get_category()) { + case T_CHARMAP: + return ("CHARMAP"); + case T_WIDTH: + return ("WIDTH"); + case T_COLLATE: + return ("LC_COLLATE"); + case T_CTYPE: + return ("LC_CTYPE"); + case T_MESSAGES: + return ("LC_MESSAGES"); + case T_MONETARY: + return ("LC_MONETARY"); + case T_NUMERIC: + return ("LC_NUMERIC"); + case T_TIME: + return ("LC_TIME"); + default: + INTERR; + return (NULL); + } +} + +static char * +category_file(void) +{ + if (bsd) + (void) snprintf(locpath, sizeof (locpath), "%s.%s", + locname, category_name()); + else + (void) snprintf(locpath, sizeof (locpath), "%s/%s", + locname, category_name()); + return (locpath); +} + +FILE * +open_category(void) +{ + FILE *file; + + if (verbose) { + (void) printf("Writing category %s: ", category_name()); + (void) fflush(stdout); + } + + /* make the parent directory */ + if (!bsd) + (void) mkdir(dirname(category_file()), 0755); + + /* + * note that we have to regenerate the file name, as dirname + * clobbered it. + */ + file = fopen(category_file(), "w"); + if (file == NULL) { + errf(strerror(errno)); + return (NULL); + } + return (file); +} + +void +close_category(FILE *f) +{ + if (fchmod(fileno(f), 0644) < 0) { + (void) fclose(f); + (void) unlink(category_file()); + errf(strerror(errno)); + } + if (fclose(f) < 0) { + (void) unlink(category_file()); + errf(strerror(errno)); + } + if (verbose) { + (void) fprintf(stdout, "done.\n"); + (void) fflush(stdout); + } +} + +/* + * This function is used when copying the category from another + * locale. Note that the copy is actually performed using a hard + * link for efficiency. + */ +void +copy_category(char *src) +{ + char srcpath[PATH_MAX]; + int rv; + + (void) snprintf(srcpath, sizeof (srcpath), "%s/%s", + src, category_name()); + rv = access(srcpath, R_OK); + if ((rv != 0) && (strchr(srcpath, '/') == NULL)) { + /* Maybe we should try the system locale */ + (void) snprintf(srcpath, sizeof (srcpath), + "/usr/lib/locale/%s/%s", src, category_name()); + rv = access(srcpath, R_OK); + } + + if (rv != 0) { + fprintf(stderr,"source locale data unavailable: %s", src); + return; + } + + if (verbose > 1) { + (void) printf("Copying category %s from %s: ", + category_name(), src); + (void) fflush(stdout); + } + + /* make the parent directory */ + if (!bsd) + (void) mkdir(dirname(category_file()), 0755); + + if (link(srcpath, category_file()) != 0) { + fprintf(stderr,"unable to copy locale data: %s", + strerror(errno)); + return; + } + if (verbose > 1) { + (void) printf("done.\n"); + } +} + +int +putl_category(const char *s, FILE *f) +{ + if (s && fputs(s, f) == EOF) { + (void) fclose(f); + (void) unlink(category_file()); + errf(strerror(errno)); + return (EOF); + } + if (fputc('\n', f) == EOF) { + (void) fclose(f); + (void) unlink(category_file()); + errf(strerror(errno)); + return (EOF); + } + return (0); +} + +int +wr_category(void *buf, size_t sz, FILE *f) +{ + if (!sz) { + return (0); + } + if (fwrite(buf, sz, 1, f) < 1) { + (void) fclose(f); + (void) unlink(category_file()); + errf(strerror(errno)); + return (EOF); + } + return (0); +} + +int yyparse(void); + +static void +usage(void) +{ + (void) fprintf(stderr, "Usage: localedef [options] localename\n"); + (void) fprintf(stderr, "[options] are:\n"); + (void) fprintf(stderr, " -D : BSD-style output\n"); + (void) fprintf(stderr, " -c : ignore warnings\n"); + (void) fprintf(stderr, " -v : verbose output\n"); + (void) fprintf(stderr, " -U : ignore undefined symbols\n"); + (void) fprintf(stderr, " -f charmap : use given charmap file\n"); + (void) fprintf(stderr, " -u encoding : assume encoding\n"); + (void) fprintf(stderr, " -w widths : use screen widths file\n"); + (void) fprintf(stderr, " -i locsrc : source file for locale\n"); + exit(4); +} + +int +main(int argc, char **argv) +{ + int c; + char *lfname = NULL; + char *cfname = NULL; + char *wfname = NULL; + DIR *dir; + + init_charmap(); + init_collate(); + init_ctype(); + init_messages(); + init_monetary(); + init_numeric(); + init_time(); + + yydebug = 0; + + (void) setlocale(LC_ALL, ""); + + while ((c = getopt(argc, argv, "w:i:cf:u:vUD")) != -1) { + switch (c) { + case 'D': + bsd = 1; + break; + case 'v': + verbose++; + break; + case 'i': + lfname = optarg; + break; + case 'u': + set_wide_encoding(optarg); + break; + case 'f': + cfname = optarg; + break; + case 'U': + undefok++; + break; + case 'c': + warnok++; + break; + case 'w': + wfname = optarg; + break; + case '?': + usage(); + break; + } + } + + if ((argc - 1) != (optind)) { + usage(); + } + locname = argv[argc - 1]; + if (verbose) { + (void) printf("Processing locale %s.\n", locname); + } + + if (cfname) { + if (verbose) + (void) printf("Loading charmap %s.\n", cfname); + reset_scanner(cfname); + (void) yyparse(); + } + + if (wfname) { + if (verbose) + (void) printf("Loading widths %s.\n", wfname); + reset_scanner(wfname); + (void) yyparse(); + } + + if (verbose) { + (void) printf("Loading POSIX portable characters.\n"); + } + add_charmap_posix(); + + if (lfname) { + reset_scanner(lfname); + } else { + reset_scanner(NULL); + } + + /* make the directory for the locale if not already present */ + if (!bsd) { + while ((dir = opendir(locname)) == NULL) { + if ((errno != ENOENT) || + (mkdir(locname, 0755) < 0)) { + errf(strerror(errno)); + } + } + (void) closedir(dir); + (void) mkdir(dirname(category_file()), 0755); + } + + (void) yyparse(); + if (verbose) { + (void) printf("All done.\n"); + } + return (warnings ? 1 : 0); +} diff --git a/usr.bin/localedef/localedef.h b/usr.bin/localedef/localedef.h new file mode 100644 index 000000000000..91e47ad62215 --- /dev/null +++ b/usr.bin/localedef/localedef.h @@ -0,0 +1,172 @@ +/* + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 John Marino + * + * This source code is derived from the illumos localedef command, and + * provided under BSD-style license terms by Nexenta Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * POSIX localedef. + */ + +/* Common header files. */ +#include +#include +#include +#include + +extern int com_char; +extern int esc_char; +extern int mb_cur_max; +extern int mb_cur_min; +extern int last_kw; +extern int verbose; +extern int yydebug; +extern int lineno; +extern int undefok; /* mostly ignore undefined symbols */ +extern int warnok; +extern int warnings; + +int yylex(void); +void yyerror(const char *); +void errf(const char *, ...); +void warn(const char *, ...); + +int putl_category(const char *, FILE *); +int wr_category(void *, size_t, FILE *); +FILE *open_category(void); +void close_category(FILE *); +void copy_category(char *); +const char *category_name(void); + +int get_category(void); +int get_symbol(void); +int get_escaped(int); +int get_wide(void); +void reset_scanner(const char *); +void scan_to_eol(void); +void add_wcs(wchar_t); +void add_tok(int); +wchar_t *get_wcs(void); + +/* charmap.c - CHARMAP handling */ +void init_charmap(void); +void add_charmap(char *, int); +void add_charmap_undefined(char *); +void add_charmap_posix(void); +void add_charmap_range(char *, char *, int); +void add_charmap_char(char *name, int val); +int lookup_charmap(const char *, wchar_t *); +int check_charmap_undefined(char *); +int check_charmap(wchar_t); + +/* collate.o - LC_COLLATE handling */ +typedef struct collelem collelem_t; +typedef struct collsym collsym_t; +void init_collate(void); +void define_collsym(char *); +void define_collelem(char *, wchar_t *); +void add_order_directive(void); +void add_order_bit(int); +void dump_collate(void); +collsym_t *lookup_collsym(char *); +collelem_t *lookup_collelem(char *); +void start_order_collelem(collelem_t *); +void start_order_undefined(void); +void start_order_symbol(char *); +void start_order_char(wchar_t); +void start_order_ellipsis(void); +void end_order_collsym(collsym_t *); +void end_order(void); +void add_weight(int32_t, int); +void add_weights(int32_t *); +void add_weight_num(int); +void add_order_collelem(collelem_t *); +void add_order_collsym(collsym_t *); +void add_order_char(wchar_t); +void add_order_ignore(void); +void add_order_ellipsis(void); +void add_order_symbol(char *); +void add_order_subst(void); +void add_subst_char(wchar_t); +void add_subst_collsym(collsym_t *); +void add_subst_collelem(collelem_t *); +void add_subst_symbol(char *); +int32_t get_weight(int32_t, int); +wchar_t * wsncpy(wchar_t *, const wchar_t *, size_t); + + +/* ctype.c - LC_CTYPE handling */ +void init_ctype(void); +void add_ctype(int); +void add_ctype_range(int); +void add_width(int, int); +void add_width_range(int, int, int); +void add_caseconv(int, int); +void dump_ctype(void); + +/* messages.c - LC_MESSAGES handling */ +void init_messages(void); +void add_message(wchar_t *); +void dump_messages(void); + +/* monetary.c - LC_MONETARY handling */ +void init_monetary(void); +void add_monetary_str(wchar_t *); +void add_monetary_num(int); +void reset_monetary_group(void); +void add_monetary_group(int); +void dump_monetary(void); + +/* numeric.c - LC_NUMERIC handling */ +void init_numeric(void); +void add_numeric_str(wchar_t *); +void reset_numeric_group(void); +void add_numeric_group(int); +void dump_numeric(void); + +/* time.c - LC_TIME handling */ +void init_time(void); +void add_time_str(wchar_t *); +void reset_time_list(void); +void add_time_list(wchar_t *); +void check_time_list(void); +void dump_time(void); + +/* wide.c - Wide character handling. */ +int to_wide(wchar_t *, const char *); +int to_mbs(char *, wchar_t); +int to_mb(char *, wchar_t); +char *to_mb_string(const wchar_t *); +void set_wide_encoding(const char *); +void werr(const char *, ...); +const char *get_wide_encoding(void); +int max_wide(void); + +//#define _(x) gettext(x) +#define INTERR fprintf(stderr,"internal fault (%s:%d)", __FILE__, __LINE__) diff --git a/usr.bin/localedef/messages.c b/usr.bin/localedef/messages.c new file mode 100644 index 000000000000..0502eb5c3c1e --- /dev/null +++ b/usr.bin/localedef/messages.c @@ -0,0 +1,121 @@ +/* + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 John Marino + * + * This source code is derived from the illumos localedef command, and + * provided under BSD-style license terms by Nexenta Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * LC_MESSAGES database generation routines for localedef. + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include "localedef.h" +#include "parser.h" +#include "lmessages.h" + +static struct lc_messages_T msgs; + +void +init_messages(void) +{ + (void) memset(&msgs, 0, sizeof (msgs)); +} + +void +add_message(wchar_t *wcs) +{ + char *str; + + if ((str = to_mb_string(wcs)) == NULL) { + INTERR; + return; + } + free(wcs); + + switch (last_kw) { + case T_YESSTR: + msgs.yesstr = str; + break; + case T_NOSTR: + msgs.nostr = str; + break; + case T_YESEXPR: + msgs.yesexpr = str; + break; + case T_NOEXPR: + msgs.noexpr = str; + break; + default: + free(str); + INTERR; + break; + } +} + +void +dump_messages(void) +{ + FILE *f; + char *ptr; + + if (msgs.yesstr == NULL) { + warn("missing field 'yesstr'"); + msgs.yesstr = ""; + } + if (msgs.nostr == NULL) { + warn("missing field 'nostr'"); + msgs.nostr = ""; + } + + /* + * CLDR likes to add : separated lists for yesstr and nostr. + * Legacy Solaris code does not seem to grok this. Fix it. + */ + if ((ptr = strchr(msgs.yesstr, ':')) != NULL) + *ptr = 0; + if ((ptr = strchr(msgs.nostr, ':')) != NULL) + *ptr = 0; + + if ((f = open_category()) == NULL) { + return; + } + + if ((putl_category(msgs.yesexpr, f) == EOF) || + (putl_category(msgs.noexpr, f) == EOF) || + (putl_category(msgs.yesstr, f) == EOF) || + (putl_category(msgs.nostr, f) == EOF)) { + return; + } + close_category(f); +} diff --git a/usr.bin/localedef/monetary.c b/usr.bin/localedef/monetary.c new file mode 100644 index 000000000000..4700a40799b9 --- /dev/null +++ b/usr.bin/localedef/monetary.c @@ -0,0 +1,216 @@ +/* + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 John Marino + * + * This source code is derived from the illumos localedef command, and + * provided under BSD-style license terms by Nexenta Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * LC_MONETARY database generation routines for localedef. + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include "localedef.h" +#include "parser.h" +#include "lmonetary.h" + +static struct lc_monetary_T mon; + +void +init_monetary(void) +{ + (void) memset(&mon, 0, sizeof (mon)); +} + +void +add_monetary_str(wchar_t *wcs) +{ + char *str; + + if ((str = to_mb_string(wcs)) == NULL) { + INTERR; + return; + } + free(wcs); + switch (last_kw) { + case T_INT_CURR_SYMBOL: + mon.int_curr_symbol = str; + break; + case T_CURRENCY_SYMBOL: + mon.currency_symbol = str; + break; + case T_MON_DECIMAL_POINT: + mon.mon_decimal_point = str; + break; + case T_MON_THOUSANDS_SEP: + mon.mon_thousands_sep = str; + break; + case T_POSITIVE_SIGN: + mon.positive_sign = str; + break; + case T_NEGATIVE_SIGN: + mon.negative_sign = str; + break; + default: + free(str); + INTERR; + break; + } +} + +void +add_monetary_num(int n) +{ + char *str = NULL; + + (void) asprintf(&str, "%d", n); + if (str == NULL) { + fprintf(stderr, "out of memory"); + return; + } + + switch (last_kw) { + case T_INT_FRAC_DIGITS: + mon.int_frac_digits = str; + break; + case T_FRAC_DIGITS: + mon.frac_digits = str; + break; + case T_P_CS_PRECEDES: + mon.p_cs_precedes = str; + break; + case T_P_SEP_BY_SPACE: + mon.p_sep_by_space = str; + break; + case T_N_CS_PRECEDES: + mon.n_cs_precedes = str; + break; + case T_N_SEP_BY_SPACE: + mon.n_sep_by_space = str; + break; + case T_P_SIGN_POSN: + mon.p_sign_posn = str; + break; + case T_N_SIGN_POSN: + mon.n_sign_posn = str; + break; + case T_INT_P_CS_PRECEDES: + mon.int_p_cs_precedes = str; + break; + case T_INT_N_CS_PRECEDES: + mon.int_n_cs_precedes = str; + break; + case T_INT_P_SEP_BY_SPACE: + mon.int_p_sep_by_space = str; + break; + case T_INT_N_SEP_BY_SPACE: + mon.int_n_sep_by_space = str; + break; + case T_INT_P_SIGN_POSN: + mon.int_p_sign_posn = str; + break; + case T_INT_N_SIGN_POSN: + mon.int_n_sign_posn = str; + break; + case T_MON_GROUPING: + mon.mon_grouping = str; + break; + default: + INTERR; + break; + } +} + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-qual" + +void +reset_monetary_group(void) +{ + free((char *)mon.mon_grouping); + mon.mon_grouping = NULL; +} + +void +add_monetary_group(int n) +{ + char *s = NULL; + + if (mon.mon_grouping == NULL) { + (void) asprintf(&s, "%d", n); + } else { + (void) asprintf(&s, "%s;%d", mon.mon_grouping, n); + } + if (s == NULL) + fprintf(stderr, "out of memory"); + + free((char *)mon.mon_grouping); + mon.mon_grouping = s; +} + +#pragma GCC diagnostic pop + +void +dump_monetary(void) +{ + FILE *f; + + if ((f = open_category()) == NULL) { + return; + } + + if ((putl_category(mon.int_curr_symbol, f) == EOF) || + (putl_category(mon.currency_symbol, f) == EOF) || + (putl_category(mon.mon_decimal_point, f) == EOF) || + (putl_category(mon.mon_thousands_sep, f) == EOF) || + (putl_category(mon.mon_grouping, f) == EOF) || + (putl_category(mon.positive_sign, f) == EOF) || + (putl_category(mon.negative_sign, f) == EOF) || + (putl_category(mon.int_frac_digits, f) == EOF) || + (putl_category(mon.frac_digits, f) == EOF) || + (putl_category(mon.p_cs_precedes, f) == EOF) || + (putl_category(mon.p_sep_by_space, f) == EOF) || + (putl_category(mon.n_cs_precedes, f) == EOF) || + (putl_category(mon.n_sep_by_space, f) == EOF) || + (putl_category(mon.p_sign_posn, f) == EOF) || + (putl_category(mon.n_sign_posn, f) == EOF) || + (putl_category(mon.int_p_cs_precedes, f) == EOF) || + (putl_category(mon.int_n_cs_precedes, f) == EOF) || + (putl_category(mon.int_p_sep_by_space, f) == EOF) || + (putl_category(mon.int_n_sep_by_space, f) == EOF) || + (putl_category(mon.int_p_sign_posn, f) == EOF) || + (putl_category(mon.int_n_sign_posn, f) == EOF)) { + return; + } + close_category(f); +} diff --git a/usr.bin/localedef/numeric.c b/usr.bin/localedef/numeric.c new file mode 100644 index 000000000000..0a293ff95b66 --- /dev/null +++ b/usr.bin/localedef/numeric.c @@ -0,0 +1,124 @@ +/* + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 John Marino + * + * This source code is derived from the illumos localedef command, and + * provided under BSD-style license terms by Nexenta Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * LC_NUMERIC database generation routines for localedef. + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include "localedef.h" +#include "parser.h" +#include "lnumeric.h" + +static struct lc_numeric_T numeric; + +void +init_numeric(void) +{ + (void) memset(&numeric, 0, sizeof (numeric)); +} + +void +add_numeric_str(wchar_t *wcs) +{ + char *str; + + if ((str = to_mb_string(wcs)) == NULL) { + INTERR; + return; + } + free(wcs); + + switch (last_kw) { + case T_DECIMAL_POINT: + numeric.decimal_point = str; + break; + case T_THOUSANDS_SEP: + numeric.thousands_sep = str; + break; + default: + free(str); + INTERR; + break; + } +} + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-qual" + +void +reset_numeric_group(void) +{ + free((char *)numeric.grouping); + numeric.grouping = NULL; +} + +void +add_numeric_group(int n) +{ + char *s; + + if (numeric.grouping == NULL) { + (void) asprintf(&s, "%d", n); + } else { + (void) asprintf(&s, "%s;%d", numeric.grouping, n); + } + if (s == NULL) + fprintf(stderr, "out of memory"); + + free((char *)numeric.grouping); + numeric.grouping = s; +} + +#pragma GCC diagnostic pop + +void +dump_numeric(void) +{ + FILE *f; + + if ((f = open_category()) == NULL) { + return; + } + + if ((putl_category(numeric.decimal_point, f) == EOF) || + (putl_category(numeric.thousands_sep, f) == EOF) || + (putl_category(numeric.grouping, f) == EOF)) { + return; + } + close_category(f); +} diff --git a/usr.bin/localedef/parser.y b/usr.bin/localedef/parser.y new file mode 100644 index 000000000000..87ff95d5e956 --- /dev/null +++ b/usr.bin/localedef/parser.y @@ -0,0 +1,706 @@ +%{ +/* + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 John Marino + * + * This source code is derived from the illumos localedef command, and + * provided under BSD-style license terms by Nexenta Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * POSIX localedef grammar. + */ + +#include +#include +#include +#include "localedef.h" + +%} +%union { + int num; + wchar_t wc; + char *token; + collsym_t *collsym; + collelem_t *collelem; +} + +%token T_CODE_SET +%token T_MB_CUR_MAX +%token T_MB_CUR_MIN +%token T_COM_CHAR +%token T_ESC_CHAR +%token T_LT +%token T_GT +%token T_NL +%token T_SEMI +%token T_COMMA +%token T_ELLIPSIS +%token T_RPAREN +%token T_LPAREN +%token T_QUOTE +%token T_NULL +%token T_WS +%token T_END +%token T_COPY +%token T_CHARMAP +%token T_WIDTH +%token T_CTYPE +%token T_ISUPPER +%token T_ISLOWER +%token T_ISALPHA +%token T_ISDIGIT +%token T_ISPUNCT +%token T_ISXDIGIT +%token T_ISSPACE +%token T_ISPRINT +%token T_ISGRAPH +%token T_ISBLANK +%token T_ISCNTRL +%token T_ISALNUM +%token T_ISSPECIAL +%token T_ISPHONOGRAM +%token T_ISIDEOGRAM +%token T_ISENGLISH +%token T_ISNUMBER +%token T_TOUPPER +%token T_TOLOWER +%token T_COLLATE +%token T_COLLATING_SYMBOL +%token T_COLLATING_ELEMENT +%token T_ORDER_START +%token T_ORDER_END +%token T_FORWARD +%token T_BACKWARD +%token T_POSITION +%token T_FROM +%token T_UNDEFINED +%token T_IGNORE +%token T_MESSAGES +%token T_YESSTR +%token T_NOSTR +%token T_YESEXPR +%token T_NOEXPR +%token T_MONETARY +%token T_INT_CURR_SYMBOL +%token T_CURRENCY_SYMBOL +%token T_MON_DECIMAL_POINT +%token T_MON_THOUSANDS_SEP +%token T_POSITIVE_SIGN +%token T_NEGATIVE_SIGN +%token T_MON_GROUPING +%token T_INT_FRAC_DIGITS +%token T_FRAC_DIGITS +%token T_P_CS_PRECEDES +%token T_P_SEP_BY_SPACE +%token T_N_CS_PRECEDES +%token T_N_SEP_BY_SPACE +%token T_P_SIGN_POSN +%token T_N_SIGN_POSN +%token T_INT_P_CS_PRECEDES +%token T_INT_N_CS_PRECEDES +%token T_INT_P_SEP_BY_SPACE +%token T_INT_N_SEP_BY_SPACE +%token T_INT_P_SIGN_POSN +%token T_INT_N_SIGN_POSN +%token T_NUMERIC +%token T_DECIMAL_POINT +%token T_THOUSANDS_SEP +%token T_GROUPING +%token T_TIME +%token T_ABDAY +%token T_DAY +%token T_ABMON +%token T_MON +%token T_ERA +%token T_ERA_D_FMT +%token T_ERA_T_FMT +%token T_ERA_D_T_FMT +%token T_ALT_DIGITS +%token T_D_T_FMT +%token T_D_FMT +%token T_T_FMT +%token T_AM_PM +%token T_T_FMT_AMPM +%token T_DATE_FMT +%token T_CHAR +%token T_NAME +%token T_NUMBER +%token T_SYMBOL +%token T_COLLSYM +%token T_COLLELEM + +%% + +localedef : setting_list categories + | categories + ; + +string : T_QUOTE charlist T_QUOTE + | T_QUOTE T_QUOTE + ; + +charlist : charlist T_CHAR + { + add_wcs($2); + } + | T_CHAR + { + add_wcs($1); + } + ; + +setting_list : setting_list setting + | setting + ; + + +setting : T_COM_CHAR T_CHAR T_NL + { + com_char = $2; + } + | T_ESC_CHAR T_CHAR T_NL + { + esc_char = $2; + } + | T_MB_CUR_MAX T_NUMBER T_NL + { + mb_cur_max = $2; + } + | T_MB_CUR_MIN T_NUMBER T_NL + { + mb_cur_min = $2; + } + | T_CODE_SET string T_NL + { + wchar_t *w = get_wcs(); + set_wide_encoding(to_mb_string(w)); + free(w); + } + | T_CODE_SET T_NAME T_NL + { + set_wide_encoding($2); + } + ; + +copycat : T_COPY T_NAME T_NL + { + copy_category($2); + } + | T_COPY string T_NL + { + wchar_t *w = get_wcs(); + copy_category(to_mb_string(w)); + free(w); + } + ; + +categories : categories category + | category + ; + + +category : charmap + | messages + | monetary + | ctype + | collate + | numeric + | time + ; + + +charmap : T_CHARMAP T_NL charmap_list T_END T_CHARMAP T_NL + | T_WIDTH T_NL width_list T_END T_WIDTH T_NL + ; + + +charmap_list : charmap_list charmap_entry + | charmap_entry + ; + + +charmap_entry : T_SYMBOL T_CHAR + { + add_charmap($1, $2); + scan_to_eol(); + } + | T_SYMBOL T_ELLIPSIS T_SYMBOL T_CHAR + { + add_charmap_range($1, $3, $4); + scan_to_eol(); + } + | T_NL + ; + +width_list : width_list width_entry + | width_entry + ; + +width_entry : T_CHAR T_NUMBER T_NL + { + add_width($1, $2); + } + | T_SYMBOL T_NUMBER T_NL + { + add_charmap_undefined($1); + } + | T_CHAR T_ELLIPSIS T_CHAR T_NUMBER T_NL + { + add_width_range($1, $3, $4); + } + | T_SYMBOL T_ELLIPSIS T_SYMBOL T_NUMBER T_NL + { + add_charmap_undefined($1); + add_charmap_undefined($3); + } + | T_CHAR T_ELLIPSIS T_SYMBOL T_NUMBER T_NL + { + add_width($1, $4); + add_charmap_undefined($3); + } + | T_SYMBOL T_ELLIPSIS T_CHAR T_NUMBER T_NL + { + add_width($3, $4); + add_charmap_undefined($1); + } + | T_NL + ; + +ctype : T_CTYPE T_NL ctype_list T_END T_CTYPE T_NL + { + dump_ctype(); + } + | T_CTYPE T_NL copycat T_END T_CTYPE T_NL + ; + +ctype_list : ctype_list ctype_kw + | ctype_kw + ; + +ctype_kw : T_ISUPPER cc_list T_NL + | T_ISLOWER cc_list T_NL + | T_ISALPHA cc_list T_NL + | T_ISDIGIT cc_list T_NL + | T_ISPUNCT cc_list T_NL + | T_ISXDIGIT cc_list T_NL + | T_ISSPACE cc_list T_NL + | T_ISPRINT cc_list T_NL + | T_ISGRAPH cc_list T_NL + | T_ISBLANK cc_list T_NL + | T_ISCNTRL cc_list T_NL + | T_ISALNUM cc_list T_NL + | T_ISSPECIAL cc_list T_NL + | T_ISENGLISH cc_list T_NL + | T_ISNUMBER cc_list T_NL + | T_ISIDEOGRAM cc_list T_NL + | T_ISPHONOGRAM cc_list T_NL + | T_TOUPPER conv_list T_NL + | T_TOLOWER conv_list T_NL + ; + + +cc_list : cc_list T_SEMI T_CHAR + { + add_ctype($3); + } + | cc_list T_SEMI T_SYMBOL + { + add_charmap_undefined($3); + } + | cc_list T_SEMI T_ELLIPSIS T_SEMI T_CHAR + { + /* note that the endpoints *must* be characters */ + add_ctype_range($5); + } + | T_CHAR + { + add_ctype($1); + } + | T_SYMBOL + { + add_charmap_undefined($1); + } + ; + +conv_list : conv_list T_SEMI conv_pair + | conv_pair + ; + + +conv_pair : T_LPAREN T_CHAR T_COMMA T_CHAR T_RPAREN + { + add_caseconv($2, $4); + } + | T_LPAREN T_SYMBOL T_COMMA T_CHAR T_RPAREN + { + add_charmap_undefined($2); + } + | T_LPAREN T_SYMBOL T_COMMA T_SYMBOL T_RPAREN + { + add_charmap_undefined($2); + add_charmap_undefined($4); + } + | T_LPAREN T_CHAR T_COMMA T_SYMBOL T_RPAREN + { + add_charmap_undefined($4); + } + ; + +collate : T_COLLATE T_NL coll_order T_END T_COLLATE T_NL + { + dump_collate(); + } + | T_COLLATE T_NL coll_optional coll_order T_END T_COLLATE T_NL + { + dump_collate(); + } + | T_COLLATE T_NL copycat T_END T_COLLATE T_NL + ; + + +coll_optional : coll_optional coll_symbols + | coll_optional coll_elements + | coll_symbols + | coll_elements + ; + + +coll_symbols : T_COLLATING_SYMBOL T_SYMBOL T_NL + { + define_collsym($2); + } + ; + + +coll_elements : T_COLLATING_ELEMENT T_SYMBOL T_FROM string T_NL + { + define_collelem($2, get_wcs()); + } + ; + +coll_order : T_ORDER_START T_NL order_list T_ORDER_END T_NL + { + /* If no order list supplied default to one forward */ + add_order_bit(T_FORWARD); + add_order_directive(); + } + | T_ORDER_START order_args T_NL order_list T_ORDER_END T_NL + ; + + +order_args : order_args T_SEMI order_arg + { + add_order_directive(); + } + | order_arg + { + add_order_directive(); + } + ; + +order_arg : order_arg T_COMMA order_dir + | order_dir + ; + +order_dir : T_FORWARD + { + add_order_bit(T_FORWARD); + } + | T_BACKWARD + { + add_order_bit(T_BACKWARD); + } + | T_POSITION + { + add_order_bit(T_POSITION); + } + ; + +order_list : order_list order_item + | order_item + ; + +order_item : T_COLLSYM T_NL + { + end_order_collsym($1); + } + | order_itemkw T_NL + { + end_order(); + } + | order_itemkw order_weights T_NL + { + end_order(); + } + ; + +order_itemkw : T_CHAR + { + start_order_char($1); + } + | T_ELLIPSIS + { + start_order_ellipsis(); + } + | T_COLLELEM + { + start_order_collelem($1); + } + | T_UNDEFINED + { + start_order_undefined(); + } + | T_SYMBOL + { + start_order_symbol($1); + } + ; + +order_weights : order_weights T_SEMI order_weight + | order_weights T_SEMI + | order_weight + ; + +order_weight : T_COLLELEM + { + add_order_collelem($1); + } + | T_COLLSYM + { + add_order_collsym($1); + } + | T_CHAR + { + add_order_char($1); + } + | T_ELLIPSIS + { + add_order_ellipsis(); + } + | T_IGNORE + { + add_order_ignore(); + } + | T_SYMBOL + { + add_order_symbol($1); + } + | T_QUOTE order_str T_QUOTE + { + add_order_subst(); + } + ; + +order_str : order_str order_stritem + | order_stritem + ; + +order_stritem : T_CHAR + { + add_subst_char($1); + } + | T_COLLSYM + { + add_subst_collsym($1); + } + | T_COLLELEM + { + add_subst_collelem($1); + } + | T_SYMBOL + { + add_subst_symbol($1); + } + ; + +messages : T_MESSAGES T_NL messages_list T_END T_MESSAGES T_NL + { + dump_messages(); + } + | T_MESSAGES T_NL copycat T_END T_MESSAGES T_NL + ; + +messages_list : messages_list messages_item + | messages_item + ; + +messages_kw : T_YESSTR + | T_NOSTR + | T_YESEXPR + | T_NOEXPR + ; + +messages_item : messages_kw string T_NL + { + add_message(get_wcs()); + } + ; + +monetary : T_MONETARY T_NL monetary_list T_END T_MONETARY T_NL + { + dump_monetary(); + } + | T_MONETARY T_NL copycat T_END T_MONETARY T_NL + ; + +monetary_list : monetary_list monetary_kw + | monetary_kw + ; + +monetary_strkw : T_INT_CURR_SYMBOL + | T_CURRENCY_SYMBOL + | T_MON_DECIMAL_POINT + | T_MON_THOUSANDS_SEP + | T_POSITIVE_SIGN + | T_NEGATIVE_SIGN + ; + +monetary_numkw : T_INT_FRAC_DIGITS + | T_FRAC_DIGITS + | T_P_CS_PRECEDES + | T_P_SEP_BY_SPACE + | T_N_CS_PRECEDES + | T_N_SEP_BY_SPACE + | T_P_SIGN_POSN + | T_N_SIGN_POSN + | T_INT_P_CS_PRECEDES + | T_INT_N_CS_PRECEDES + | T_INT_P_SEP_BY_SPACE + | T_INT_N_SEP_BY_SPACE + | T_INT_P_SIGN_POSN + | T_INT_N_SIGN_POSN + ; + +monetary_kw : monetary_strkw string T_NL + { + add_monetary_str(get_wcs()); + } + | monetary_numkw T_NUMBER T_NL + { + add_monetary_num($2); + } + | T_MON_GROUPING mon_group_list T_NL + ; + +mon_group_list : T_NUMBER + { + reset_monetary_group(); + add_monetary_group($1); + } + | mon_group_list T_SEMI T_NUMBER + { + add_monetary_group($3); + } + ; + + +numeric : T_NUMERIC T_NL numeric_list T_END T_NUMERIC T_NL + { + dump_numeric(); + } + | T_NUMERIC T_NL copycat T_END T_NUMERIC T_NL + ; + + +numeric_list : numeric_list numeric_item + | numeric_item + ; + + +numeric_item : numeric_strkw string T_NL + { + add_numeric_str(get_wcs()); + } + | T_GROUPING group_list T_NL + ; + +numeric_strkw : T_DECIMAL_POINT + | T_THOUSANDS_SEP + ; + + +group_list : T_NUMBER + { + reset_numeric_group(); + add_numeric_group($1); + } + | group_list T_SEMI T_NUMBER + { + add_numeric_group($3); + } + ; + + +time : T_TIME T_NL time_kwlist T_END T_TIME T_NL + { + dump_time(); + } + | T_TIME T_NL copycat T_END T_NUMERIC T_NL + ; + +time_kwlist : time_kwlist time_kw + | time_kw + ; + +time_kw : time_strkw string T_NL + { + add_time_str(get_wcs()); + } + | time_listkw time_list T_NL + { + check_time_list(); + } + ; + +time_listkw : T_ABDAY + | T_DAY + | T_ABMON + | T_MON + | T_ERA + | T_ALT_DIGITS + | T_AM_PM + ; + +time_strkw : T_ERA_D_T_FMT + | T_ERA_T_FMT + | T_ERA_D_FMT + | T_D_T_FMT + | T_D_FMT + | T_T_FMT + | T_T_FMT_AMPM + | T_DATE_FMT + ; + +time_list : time_list T_SEMI string + { + add_time_list(get_wcs()); + } + | string + { + reset_time_list(); + add_time_list(get_wcs()); + } + ; diff --git a/usr.bin/localedef/scanner.c b/usr.bin/localedef/scanner.c new file mode 100644 index 000000000000..0b57e5bd120e --- /dev/null +++ b/usr.bin/localedef/scanner.c @@ -0,0 +1,866 @@ +/* + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 John Marino + * + * This source code is derived from the illumos localedef command, and + * provided under BSD-style license terms by Nexenta Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This file contains the "scanner", which tokenizes the input files + * for localedef for processing by the higher level grammar processor. + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include "localedef.h" +#include "parser.h" + +int com_char = '#'; +int esc_char = '\\'; +int mb_cur_min = 1; +int mb_cur_max = 1; +int lineno = 1; +int warnings = 0; +int is_stdin = 1; +FILE *input; +static int nextline; +//static FILE *input = stdin; +static const char *filename = ""; +static int instring = 0; +static int escaped = 0; + +/* + * Token space ... grows on demand. + */ +static char *token = NULL; +static int tokidx; +static int toksz = 0; +static int hadtok = 0; + +/* + * Wide string space ... grows on demand. + */ +static wchar_t *widestr = NULL; +static int wideidx = 0; +static int widesz = 0; + +/* + * The last keyword seen. This is useful to trigger the special lexer rules + * for "copy" and also collating symbols and elements. + */ +int last_kw = 0; +static int category = T_END; + +static struct token { + int id; + const char *name; +} keywords[] = { + { T_COM_CHAR, "comment_char" }, + { T_ESC_CHAR, "escape_char" }, + { T_END, "END" }, + { T_COPY, "copy" }, + { T_MESSAGES, "LC_MESSAGES" }, + { T_YESSTR, "yesstr" }, + { T_YESEXPR, "yesexpr" }, + { T_NOSTR, "nostr" }, + { T_NOEXPR, "noexpr" }, + { T_MONETARY, "LC_MONETARY" }, + { T_INT_CURR_SYMBOL, "int_curr_symbol" }, + { T_CURRENCY_SYMBOL, "currency_symbol" }, + { T_MON_DECIMAL_POINT, "mon_decimal_point" }, + { T_MON_THOUSANDS_SEP, "mon_thousands_sep" }, + { T_POSITIVE_SIGN, "positive_sign" }, + { T_NEGATIVE_SIGN, "negative_sign" }, + { T_MON_GROUPING, "mon_grouping" }, + { T_INT_FRAC_DIGITS, "int_frac_digits" }, + { T_FRAC_DIGITS, "frac_digits" }, + { T_P_CS_PRECEDES, "p_cs_precedes" }, + { T_P_SEP_BY_SPACE, "p_sep_by_space" }, + { T_N_CS_PRECEDES, "n_cs_precedes" }, + { T_N_SEP_BY_SPACE, "n_sep_by_space" }, + { T_P_SIGN_POSN, "p_sign_posn" }, + { T_N_SIGN_POSN, "n_sign_posn" }, + { T_INT_P_CS_PRECEDES, "int_p_cs_precedes" }, + { T_INT_N_CS_PRECEDES, "int_n_cs_precedes" }, + { T_INT_P_SEP_BY_SPACE, "int_p_sep_by_space" }, + { T_INT_N_SEP_BY_SPACE, "int_n_sep_by_space" }, + { T_INT_P_SIGN_POSN, "int_p_sign_posn" }, + { T_INT_N_SIGN_POSN, "int_n_sign_posn" }, + { T_COLLATE, "LC_COLLATE" }, + { T_COLLATING_SYMBOL, "collating-symbol" }, + { T_COLLATING_ELEMENT, "collating-element" }, + { T_FROM, "from" }, + { T_ORDER_START, "order_start" }, + { T_ORDER_END, "order_end" }, + { T_FORWARD, "forward" }, + { T_BACKWARD, "backward" }, + { T_POSITION, "position" }, + { T_IGNORE, "IGNORE" }, + { T_UNDEFINED, "UNDEFINED" }, + { T_NUMERIC, "LC_NUMERIC" }, + { T_DECIMAL_POINT, "decimal_point" }, + { T_THOUSANDS_SEP, "thousands_sep" }, + { T_GROUPING, "grouping" }, + { T_TIME, "LC_TIME" }, + { T_ABDAY, "abday" }, + { T_DAY, "day" }, + { T_ABMON, "abmon" }, + { T_MON, "mon" }, + { T_D_T_FMT, "d_t_fmt" }, + { T_D_FMT, "d_fmt" }, + { T_T_FMT, "t_fmt" }, + { T_AM_PM, "am_pm" }, + { T_T_FMT_AMPM, "t_fmt_ampm" }, + { T_ERA, "era" }, + { T_ERA_D_FMT, "era_d_fmt" }, + { T_ERA_T_FMT, "era_t_fmt" }, + { T_ERA_D_T_FMT, "era_d_t_fmt" }, + { T_ALT_DIGITS, "alt_digits" }, + { T_CTYPE, "LC_CTYPE" }, + { T_ISUPPER, "upper" }, + { T_ISLOWER, "lower" }, + { T_ISALPHA, "alpha" }, + { T_ISDIGIT, "digit" }, + { T_ISPUNCT, "punct" }, + { T_ISXDIGIT, "xdigit" }, + { T_ISSPACE, "space" }, + { T_ISPRINT, "print" }, + { T_ISGRAPH, "graph" }, + { T_ISBLANK, "blank" }, + { T_ISCNTRL, "cntrl" }, + /* + * These entries are local additions, and not specified by + * TOG. Note that they are not guaranteed to be accurate for + * all locales, and so applications should not depend on them. + */ + { T_ISSPECIAL, "special" }, + { T_ISENGLISH, "english" }, + { T_ISPHONOGRAM, "phonogram" }, + { T_ISIDEOGRAM, "ideogram" }, + { T_ISNUMBER, "number" }, + /* + * We have to support this in the grammar, but it would be a + * syntax error to define a character as one of these without + * also defining it as an alpha or digit. We ignore it in our + * parsing. + */ + { T_ISALNUM, "alnum" }, + { T_TOUPPER, "toupper" }, + { T_TOLOWER, "tolower" }, + + /* + * These are keywords used in the charmap file. Note that + * Solaris orginally used angle brackets to wrap some of them, + * but we removed that to simplify our parser. The first of these + * items are "global items." + */ + { T_CHARMAP, "CHARMAP" }, + { T_WIDTH, "WIDTH" }, + + { -1, NULL }, +}; + +/* + * These special words are only used in a charmap file, enclosed in <>. + */ +static struct token symwords[] = { + { T_COM_CHAR, "comment_char" }, + { T_ESC_CHAR, "escape_char" }, + { T_CODE_SET, "code_set_name" }, + { T_MB_CUR_MAX, "mb_cur_max" }, + { T_MB_CUR_MIN, "mb_cur_min" }, + { -1, NULL }, +}; + +static int categories[] = { + T_CHARMAP, + T_CTYPE, + T_COLLATE, + T_MESSAGES, + T_MONETARY, + T_NUMERIC, + T_TIME, + T_WIDTH, + 0 +}; + +void +reset_scanner(const char *fname) +{ + if (fname == NULL) { + filename = ""; + is_stdin = 1; + } else { + if (!is_stdin) + (void) fclose(input); + if ((input = fopen(fname, "r")) == NULL) { + perror("fopen"); + exit(4); + } else { + is_stdin = 0; + } + filename = fname; + } + com_char = '#'; + esc_char = '\\'; + instring = 0; + escaped = 0; + lineno = 1; + nextline = 1; + tokidx = 0; + wideidx = 0; +} + +#define hex(x) \ + (isdigit(x) ? (x - '0') : ((islower(x) ? (x - 'a') : (x - 'A')) + 10)) +#define isodigit(x) ((x >= '0') && (x <= '7')) + +static int +scanc(void) +{ + int c; + + if (is_stdin) + c = getc(stdin); + else + c = getc(input); + lineno = nextline; + if (c == '\n') { + nextline++; + } + return (c); +} + +static void +unscanc(int c) +{ + if (c == '\n') { + nextline--; + } + if (ungetc(c, is_stdin ? stdin : input) < 0) { + yyerror("ungetc failed"); + } +} + +static int +scan_hex_byte(void) +{ + int c1, c2; + int v; + + c1 = scanc(); + if (!isxdigit(c1)) { + yyerror("malformed hex digit"); + return (0); + } + c2 = scanc(); + if (!isxdigit(c2)) { + yyerror("malformed hex digit"); + return (0); + } + v = ((hex(c1) << 4) | hex(c2)); + return (v); +} + +static int +scan_dec_byte(void) +{ + int c1, c2, c3; + int b; + + c1 = scanc(); + if (!isdigit(c1)) { + yyerror("malformed decimal digit"); + return (0); + } + b = c1 - '0'; + c2 = scanc(); + if (!isdigit(c2)) { + yyerror("malformed decimal digit"); + return (0); + } + b *= 10; + b += (c2 - '0'); + c3 = scanc(); + if (!isdigit(c3)) { + unscanc(c3); + } else { + b *= 10; + b += (c3 - '0'); + } + return (b); +} + +static int +scan_oct_byte(void) +{ + int c1, c2, c3; + int b; + + b = 0; + + c1 = scanc(); + if (!isodigit(c1)) { + yyerror("malformed octal digit"); + return (0); + } + b = c1 - '0'; + c2 = scanc(); + if (!isodigit(c2)) { + yyerror("malformed octal digit"); + return (0); + } + b *= 8; + b += (c2 - '0'); + c3 = scanc(); + if (!isodigit(c3)) { + unscanc(c3); + } else { + b *= 8; + b += (c3 - '0'); + } + return (b); +} + +void +add_tok(int c) +{ + if ((tokidx + 1) >= toksz) { + toksz += 64; + if ((token = realloc(token, toksz)) == NULL) { + yyerror("out of memory"); + tokidx = 0; + toksz = 0; + return; + } + } + + token[tokidx++] = (char)c; + token[tokidx] = 0; +} +void +add_wcs(wchar_t c) +{ + if ((wideidx + 1) >= widesz) { + widesz += 64; + widestr = realloc(widestr, (widesz * sizeof (wchar_t))); + if (widestr == NULL) { + yyerror("out of memory"); + wideidx = 0; + widesz = 0; + return; + } + } + + widestr[wideidx++] = c; + widestr[wideidx] = 0; +} + +wchar_t * +get_wcs(void) +{ + wchar_t *ws = widestr; + wideidx = 0; + widestr = NULL; + widesz = 0; + if (ws == NULL) { + if ((ws = wcsdup(L"")) == NULL) { + yyerror("out of memory"); + } + } + return (ws); +} + +static int +get_byte(void) +{ + int c; + + if ((c = scanc()) != esc_char) { + unscanc(c); + return (EOF); + } + c = scanc(); + + switch (c) { + case 'd': + case 'D': + return (scan_dec_byte()); + case 'x': + case 'X': + return (scan_hex_byte()); + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + /* put the character back so we can get it */ + unscanc(c); + return (scan_oct_byte()); + default: + unscanc(c); + unscanc(esc_char); + return (EOF); + } +} + +int +get_escaped(int c) +{ + switch (c) { + case 'n': + return ('\n'); + case 'r': + return ('\r'); + case 't': + return ('\t'); + case 'f': + return ('\f'); + case 'v': + return ('\v'); + case 'b': + return ('\b'); + case 'a': + return ('\a'); + default: + return (c); + } +} + +int +get_wide(void) +{ + static char mbs[MB_LEN_MAX + 1] = ""; + static int mbi = 0; + int c; + wchar_t wc; + + if (mb_cur_max >= (int)sizeof (mbs)) { + yyerror("max multibyte character size too big"); + mbi = 0; + return (T_NULL); + } + for (;;) { + if ((mbi == mb_cur_max) || ((c = get_byte()) == EOF)) { + /* + * end of the byte sequence reached, but no + * valid wide decoding. fatal error. + */ + mbi = 0; + yyerror("not a valid character encoding"); + return (T_NULL); + } + mbs[mbi++] = c; + mbs[mbi] = 0; + + /* does it decode? */ + if (to_wide(&wc, mbs) >= 0) { + break; + } + } + + mbi = 0; + if ((category != T_CHARMAP) && (category != T_WIDTH)) { + if (check_charmap(wc) < 0) { + yyerror("no symbolic name for character"); + return (T_NULL); + } + } + + yylval.wc = wc; + return (T_CHAR); +} + +int +get_symbol(void) +{ + int c; + + while ((c = scanc()) != EOF) { + if (escaped) { + escaped = 0; + if (c == '\n') + continue; + add_tok(get_escaped(c)); + continue; + } + if (c == esc_char) { + escaped = 1; + continue; + } + if (c == '\n') { /* well that's strange! */ + yyerror("unterminated symbolic name"); + continue; + } + if (c == '>') { /* end of symbol */ + + /* + * This restarts the token from the beginning + * the next time we scan a character. (This + * token is complete.) + */ + + if (token == NULL) { + yyerror("missing symbolic name"); + return (T_NULL); + } + tokidx = 0; + + /* + * A few symbols are handled as keywords outside + * of the normal categories. + */ + if (category == T_END) { + int i; + for (i = 0; symwords[i].name != 0; i++) { + if (strcmp(token, symwords[i].name) == + 0) { + last_kw = symwords[i].id; + return (last_kw); + } + } + } + /* + * Contextual rule: Only literal characters are + * permitted in CHARMAP. Anywhere else the symbolic + * forms are fine. + */ + if ((category != T_CHARMAP) && + (lookup_charmap(token, &yylval.wc)) != -1) { + return (T_CHAR); + } + if ((yylval.collsym = lookup_collsym(token)) != NULL) { + return (T_COLLSYM); + } + if ((yylval.collelem = lookup_collelem(token)) != + NULL) { + return (T_COLLELEM); + } + /* its an undefined symbol */ + yylval.token = strdup(token); + token = NULL; + toksz = 0; + tokidx = 0; + return (T_SYMBOL); + } + add_tok(c); + } + + yyerror("unterminated symbolic name"); + return (EOF); +} + +int +get_category(void) +{ + return (category); +} + +static int +consume_token(void) +{ + int len = tokidx; + int i; + + tokidx = 0; + if (token == NULL) + return (T_NULL); + + /* + * this one is special, because we don't want it to alter the + * last_kw field. + */ + if (strcmp(token, "...") == 0) { + return (T_ELLIPSIS); + } + + /* search for reserved words first */ + for (i = 0; keywords[i].name; i++) { + int j; + if (strcmp(keywords[i].name, token) != 0) { + continue; + } + + last_kw = keywords[i].id; + + /* clear the top level category if we're done with it */ + if (last_kw == T_END) { + category = T_END; + } + + /* set the top level category if we're changing */ + for (j = 0; categories[j]; j++) { + if (categories[j] != last_kw) + continue; + category = last_kw; + } + + return (keywords[i].id); + } + + /* maybe its a numeric constant? */ + if (isdigit(*token) || (*token == '-' && isdigit(token[1]))) { + char *eptr; + yylval.num = strtol(token, &eptr, 10); + if (*eptr != 0) + yyerror("malformed number"); + return (T_NUMBER); + } + + /* + * A single lone character is treated as a character literal. + * To avoid duplication of effort, we stick in the charmap. + */ + if (len == 1) { + yylval.wc = token[0]; + return (T_CHAR); + } + + /* anything else is treated as a symbolic name */ + yylval.token = strdup(token); + token = NULL; + toksz = 0; + tokidx = 0; + return (T_NAME); +} + +void +scan_to_eol(void) +{ + int c; + while ((c = scanc()) != '\n') { + if (c == EOF) { + /* end of file without newline! */ + errf("missing newline"); + return; + } + } + assert(c == '\n'); +} + +int +yylex(void) +{ + int c; + + while ((c = scanc()) != EOF) { + + /* special handling for quoted string */ + if (instring) { + if (escaped) { + escaped = 0; + + /* if newline, just eat and forget it */ + if (c == '\n') + continue; + + if (strchr("xXd01234567", c)) { + unscanc(c); + unscanc(esc_char); + return (get_wide()); + } + yylval.wc = get_escaped(c); + return (T_CHAR); + } + if (c == esc_char) { + escaped = 1; + continue; + } + switch (c) { + case '<': + return (get_symbol()); + case '>': + /* oops! should generate syntax error */ + return (T_GT); + case '"': + instring = 0; + return (T_QUOTE); + default: + yylval.wc = c; + return (T_CHAR); + } + } + + /* escaped characters first */ + if (escaped) { + escaped = 0; + if (c == '\n') { + /* eat the newline */ + continue; + } + hadtok = 1; + if (tokidx) { + /* an escape mid-token is nonsense */ + return (T_NULL); + } + + /* numeric escapes are treated as wide characters */ + if (strchr("xXd01234567", c)) { + unscanc(c); + unscanc(esc_char); + return (get_wide()); + } + + add_tok(get_escaped(c)); + continue; + } + + /* if it is the escape charter itself note it */ + if (c == esc_char) { + escaped = 1; + continue; + } + + /* remove from the comment char to end of line */ + if (c == com_char) { + while (c != '\n') { + if ((c = scanc()) == EOF) { + /* end of file without newline! */ + return (EOF); + } + } + assert(c == '\n'); + if (!hadtok) { + /* + * If there were no tokens on this line, + * then just pretend it didn't exist at all. + */ + continue; + } + hadtok = 0; + return (T_NL); + } + + if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) { + /* + * These are all token delimiters. If there + * is a token already in progress, we need to + * process it. + */ + unscanc(c); + return (consume_token()); + } + + switch (c) { + case '\n': + if (!hadtok) { + /* + * If the line was completely devoid of tokens, + * then just ignore it. + */ + continue; + } + /* we're starting a new line, reset the token state */ + hadtok = 0; + return (T_NL); + case ',': + hadtok = 1; + return (T_COMMA); + case ';': + hadtok = 1; + return (T_SEMI); + case '(': + hadtok = 1; + return (T_LPAREN); + case ')': + hadtok = 1; + return (T_RPAREN); + case '>': + hadtok = 1; + return (T_GT); + case '<': + /* symbol start! */ + hadtok = 1; + return (get_symbol()); + case ' ': + case '\t': + /* whitespace, just ignore it */ + continue; + case '"': + hadtok = 1; + instring = 1; + return (T_QUOTE); + default: + hadtok = 1; + add_tok(c); + continue; + } + } + return (EOF); +} + +void +yyerror(const char *msg) +{ + (void) fprintf(stderr, "%s: %d: error: %s\n", + filename, lineno, msg); + exit(4); +} + +void +errf(const char *fmt, ...) +{ + char *msg; + + va_list va; + va_start(va, fmt); + (void) vasprintf(&msg, fmt, va); + va_end(va); + + (void) fprintf(stderr, "%s: %d: error: %s\n", + filename, lineno, msg); + free(msg); + exit(4); +} + +void +warn(const char *fmt, ...) +{ + char *msg; + + va_list va; + va_start(va, fmt); + (void) vasprintf(&msg, fmt, va); + va_end(va); + + (void) fprintf(stderr, "%s: %d: warning: %s\n", + filename, lineno, msg); + free(msg); + warnings++; + if (!warnok) + exit(4); +} diff --git a/usr.bin/localedef/time.c b/usr.bin/localedef/time.c new file mode 100644 index 000000000000..8a8c7107918f --- /dev/null +++ b/usr.bin/localedef/time.c @@ -0,0 +1,280 @@ +/* + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 John Marino + * + * This source code is derived from the illumos localedef command, and + * provided under BSD-style license terms by Nexenta Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * LC_TIME database generation routines for localedef. + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include "localedef.h" +#include "parser.h" +#include "timelocal.h" + +struct lc_time_T tm; + +void +init_time(void) +{ + (void) memset(&tm, 0, sizeof (tm)); +} + +void +add_time_str(wchar_t *wcs) +{ + char *str; + + if ((str = to_mb_string(wcs)) == NULL) { + INTERR; + return; + } + free(wcs); + + switch (last_kw) { + case T_D_T_FMT: + tm.c_fmt = str; + break; + case T_D_FMT: + tm.x_fmt = str; + break; + case T_T_FMT: + tm.X_fmt = str; + break; + case T_T_FMT_AMPM: + tm.ampm_fmt = str; + break; + case T_DATE_FMT: + /* + * This one is a Solaris extension, Too bad date just + * doesn't use %c, which would be simpler. + */ + tm.date_fmt = str; + break; + case T_ERA_D_FMT: + case T_ERA_T_FMT: + case T_ERA_D_T_FMT: + /* Silently ignore it. */ + break; + default: + free(str); + INTERR; + break; + } +} + +static void +add_list(const char *ptr[], char *str, int limit) +{ + int i; + for (i = 0; i < limit; i++) { + if (ptr[i] == NULL) { + ptr[i] = str; + return; + } + } + fprintf(stderr,"too many list elements"); +} + +void +add_time_list(wchar_t *wcs) +{ + char *str; + + if ((str = to_mb_string(wcs)) == NULL) { + INTERR; + return; + } + free(wcs); + + switch (last_kw) { + case T_ABMON: + add_list(tm.mon, str, 12); + break; + case T_MON: + add_list(tm.month, str, 12); + break; + case T_ABDAY: + add_list(tm.wday, str, 7); + break; + case T_DAY: + add_list(tm.weekday, str, 7); + break; + case T_AM_PM: + if (tm.am == NULL) { + tm.am = str; + } else if (tm.pm == NULL) { + tm.pm = str; + } else { + fprintf(stderr,"too many list elements"); + } + break; + case T_ALT_DIGITS: + case T_ERA: + free(str); + break; + default: + free(str); + INTERR; + break; + } +} + +void +check_time_list(void) +{ + switch (last_kw) { + case T_ABMON: + if (tm.mon[11] != NULL) + return; + break; + case T_MON: + if (tm.month[11] != NULL) + return; + break; + case T_ABDAY: + if (tm.wday[6] != NULL) + return; + break; + case T_DAY: + if (tm.weekday[6] != NULL) + return; + break; + case T_AM_PM: + if (tm.pm != NULL) + return; + break; + case T_ERA: + case T_ALT_DIGITS: + return; + default: + fprintf(stderr,"unknown list"); + break; + } + + fprintf(stderr,"too few items in list (%d)", last_kw); +} + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-qual" + +void +reset_time_list(void) +{ + int i; + switch (last_kw) { + case T_ABMON: + for (i = 0; i < 12; i++) { + free((char *)tm.mon[i]); + tm.mon[i] = NULL; + } + break; + case T_MON: + for (i = 0; i < 12; i++) { + free((char *)tm.month[i]); + tm.month[i] = NULL; + } + break; + case T_ABDAY: + for (i = 0; i < 7; i++) { + free((char *)tm.wday[i]); + tm.wday[i] = NULL; + } + break; + case T_DAY: + for (i = 0; i < 7; i++) { + free((char *)tm.weekday[i]); + tm.weekday[i] = NULL; + } + break; + case T_AM_PM: + free((char *)tm.am); + tm.am = NULL; + free((char *)tm.pm); + tm.pm = NULL; + break; + } +} + +#pragma GCC diagnostic pop + +void +dump_time(void) +{ + FILE *f; + int i; + + if ((f = open_category()) == NULL) { + return; + } + + for (i = 0; i < 12; i++) { + if (putl_category(tm.mon[i], f) == EOF) { + return; + } + } + for (i = 0; i < 12; i++) { + if (putl_category(tm.month[i], f) == EOF) { + return; + } + } + for (i = 0; i < 7; i++) { + if (putl_category(tm.wday[i], f) == EOF) { + return; + } + } + for (i = 0; i < 7; i++) { + if (putl_category(tm.weekday[i], f) == EOF) { + return; + } + } + + /* + * NOTE: If date_fmt is not specified, then we'll default to + * using the %c for date. This is reasonable for most + * locales, although for reasons that I don't understand + * Solaris historically has had a seperate format for date. + */ + if ((putl_category(tm.X_fmt, f) == EOF) || + (putl_category(tm.x_fmt, f) == EOF) || + (putl_category(tm.c_fmt, f) == EOF) || + (putl_category(tm.am, f) == EOF) || + (putl_category(tm.pm, f) == EOF) || + (putl_category(tm.date_fmt ? tm.date_fmt : tm.c_fmt, f) == EOF) || + (putl_category(tm.ampm_fmt, f) == EOF)) { + return; + } + close_category(f); +} diff --git a/usr.bin/localedef/wide.c b/usr.bin/localedef/wide.c new file mode 100644 index 000000000000..1c57fed2de7c --- /dev/null +++ b/usr.bin/localedef/wide.c @@ -0,0 +1,669 @@ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2012 Garrett D'Amore All rights reserved. + * Copyright 2015 John Marino + * + * This source code is derived from the illumos localedef command, and + * provided under BSD-style license terms by Nexenta Systems, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * The functions in this file convert from the standard multibyte forms + * to the wide character forms used internally by libc. Unfortunately, + * this approach means that we need a method for each and every encoding. + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include "localedef.h" + +static int towide_none(wchar_t *, const char *, unsigned); +static int towide_utf8(wchar_t *, const char *, unsigned); +static int towide_big5(wchar_t *, const char *, unsigned); +static int towide_gbk(wchar_t *, const char *, unsigned); +static int towide_gb2312(wchar_t *, const char *, unsigned); +static int towide_gb18030(wchar_t *, const char *, unsigned); +static int towide_mskanji(wchar_t *, const char *, unsigned); +static int towide_euccn(wchar_t *, const char *, unsigned); +static int towide_eucjp(wchar_t *, const char *, unsigned); +static int towide_euckr(wchar_t *, const char *, unsigned); +static int towide_euctw(wchar_t *, const char *, unsigned); + +static int tomb_none(char *, wchar_t); +static int tomb_utf8(char *, wchar_t); +static int tomb_mbs(char *, wchar_t); + +static int (*_towide)(wchar_t *, const char *, unsigned) = towide_none; +static int (*_tomb)(char *, wchar_t) = tomb_none; +static const char *_encoding = "NONE"; +static int _nbits = 7; + +/* + * Table of supported encodings. We only bother to list the multibyte + * encodings here, because single byte locales are handed by "NONE". + */ +static struct { + const char *name; + /* the name that the underlying libc implemenation uses */ + const char *cname; + /* the maximum number of bits required for priorities */ + int nbits; + int (*towide)(wchar_t *, const char *, unsigned); + int (*tomb)(char *, wchar_t); +} mb_encodings[] = { + /* + * UTF8 values max out at 0x1fffff (although in theory there could + * be later extensions, but it won't happen.) This means we only need + * 21 bits to be able to encode the entire range of priorities. + */ + { "UTF-8", "UTF-8", 21, towide_utf8, tomb_utf8 }, + { "UTF8", "UTF-8", 21, towide_utf8, tomb_utf8 }, + { "utf8", "UTF-8", 21, towide_utf8, tomb_utf8 }, + { "utf-8", "UTF-8", 21, towide_utf8, tomb_utf8 }, + + { "EUC-CN", "EUC-CN", 16, towide_euccn, tomb_mbs }, + { "eucCN", "EUC-CN", 16, towide_euccn, tomb_mbs }, + /* + * Becuase the 3-byte form of EUC-JP use the same leading byte, + * only 17 bits required to provide unique priorities. (The low + * bit of that first byte is set.) By setting this value low, + * we can get by with only 3 bytes in the strxfrm expansion. + */ + { "EUC-JP", "EUC-JP", 17, towide_eucjp, tomb_mbs }, + { "eucJP", "EUC-JP", 17, towide_eucjp, tomb_mbs }, + + { "EUC-KR", "EUC-KR", 16, towide_euckr, tomb_mbs }, + { "eucKR", "EUC-KR", 16, towide_euckr, tomb_mbs }, + /* + * EUC-TW uses 2 bytes most of the time, but 4 bytes if the + * high order byte is 0x8E. However, with 4 byte encodings, + * the third byte will be A0-B0. So we only need to consider + * the lower order 24 bits for collation. + */ + { "EUC-TW", "EUC-TW", 24, towide_euctw, tomb_mbs }, + { "eucTW", "EUC-TW", 24, towide_euctw, tomb_mbs }, + + { "MS_Kanji", "MSKanji", 16, towide_mskanji, tomb_mbs }, + { "MSKanji", "MSKanji", 16, towide_mskanji, tomb_mbs }, + { "PCK", "MSKanji", 16, towide_mskanji, tomb_mbs }, + { "SJIS", "MSKanji", 16, towide_mskanji, tomb_mbs }, + { "Shift_JIS", "MSKanji", 16, towide_mskanji, tomb_mbs }, + + { "BIG5", "BIG5", 16, towide_big5, tomb_mbs }, + { "big5", "BIG5", 16, towide_big5, tomb_mbs }, + { "Big5", "BIG5", 16, towide_big5, tomb_mbs }, + + { "GBK", "GBK", 16, towide_gbk, tomb_mbs }, + + /* + * GB18030 can get away with just 31 bits. This is because the + * high order bit is always set for 4 byte values, and the + * at least one of the other bits in that 4 byte value will + * be non-zero. + */ + { "GB18030", "GB18030", 31, towide_gb18030, tomb_mbs }, + + /* + * This should probably be an aliase for euc-cn, or vice versa. + */ + { "GB2312", "GB2312", 16, towide_gb2312, tomb_mbs }, + + { NULL, NULL, 0, 0, 0 }, +}; + +static char * +show_mb(const char *mb) +{ + static char buf[64]; + + /* ASCII stuff we just print */ + if (isascii(*mb) && isgraph(*mb)) { + buf[0] = *mb; + buf[1] = 0; + return (buf); + } + buf[0] = 0; + while (*mb != 0) { + char scr[8]; + (void) snprintf(scr, sizeof (scr), "\\x%02x", *mb); + (void) strlcat(buf, scr, sizeof (buf)); + mb++; + } + return (buf); +} + +static char *widemsg; + +void +werr(const char *fmt, ...) +{ + char *msg; + + va_list va; + va_start(va, fmt); + (void) vasprintf(&msg, fmt, va); + va_end(va); + + free(widemsg); + widemsg = msg; +} + +/* + * This is used for 8-bit encodings. + */ +int +towide_none(wchar_t *c, const char *mb, unsigned n __unused) +{ + if (mb_cur_max != 1) { + werr("invalid or unsupported multibyte locale"); + return (-1); + } + *c = (uint8_t)*mb; + return (1); +} + +int +tomb_none(char *mb, wchar_t wc) +{ + if (mb_cur_max != 1) { + werr("invalid or unsupported multibyte locale"); + return (-1); + } + *(uint8_t *)mb = (wc & 0xff); + mb[1] = 0; + return (1); +} + +/* + * UTF-8 stores wide characters in UTF-32 form. + */ +int +towide_utf8(wchar_t *wc, const char *mb, unsigned n) +{ + wchar_t c; + int nb; + int lv; /* lowest legal value */ + int i; + const uint8_t *s = (const uint8_t *)mb; + + c = *s; + + if ((c & 0x80) == 0) { + /* 7-bit ASCII */ + *wc = c; + return (1); + } else if ((c & 0xe0) == 0xc0) { + /* u80-u7ff - two bytes encoded */ + nb = 2; + lv = 0x80; + c &= ~0xe0; + } else if ((c & 0xf0) == 0xe0) { + /* u800-uffff - three bytes encoded */ + nb = 3; + lv = 0x800; + c &= ~0xf0; + } else if ((c & 0xf8) == 0xf0) { + /* u1000-u1fffff - four bytes encoded */ + nb = 4; + lv = 0x1000; + c &= ~0xf8; + } else { + /* 5 and 6 byte encodings are not legal unicode */ + werr("utf8 encoding too large (%s)", show_mb(mb)); + return (-1); + } + if (nb > (int)n) { + werr("incomplete utf8 sequence (%s)", show_mb(mb)); + return (-1); + } + + for (i = 1; i < nb; i++) { + if (((s[i]) & 0xc0) != 0x80) { + werr("illegal utf8 byte (%x)", s[i]); + return (-1); + } + c <<= 6; + c |= (s[i] & 0x3f); + } + + if (c < lv) { + werr("illegal redundant utf8 encoding (%s)", show_mb(mb)); + return (-1); + } + *wc = c; + return (nb); +} + +int +tomb_utf8(char *mb, wchar_t wc) +{ + uint8_t *s = (uint8_t *)mb; + uint8_t msk; + int cnt; + int i; + + if (wc <= 0x7f) { + s[0] = wc & 0x7f; + s[1] = 0; + return (1); + } + if (wc <= 0x7ff) { + cnt = 2; + msk = 0xc0; + } else if (wc <= 0xffff) { + cnt = 3; + msk = 0xe0; + } else if (wc <= 0x1fffff) { + cnt = 4; + msk = 0xf0; + } else { + werr("illegal uf8 char (%x)", wc); + return (-1); + } + for (i = cnt - 1; i; i--) { + s[i] = (wc & 0x3f) | 0x80; + wc >>= 6; + } + s[0] = (msk) | wc; + s[cnt] = 0; + return (cnt); +} + +/* + * Several encodings share a simplistic dual byte encoding. In these + * forms, they all indicate that a two byte sequence is to be used if + * the first byte has its high bit set. They all store this simple + * encoding as a 16-bit value, although a great many of the possible + * code points are not used in most character sets. This gives a possible + * set of just over 32,000 valid code points. + * + * 0x00 - 0x7f - 1 byte encoding + * 0x80 - 0x7fff - illegal + * 0x8000 - 0xffff - 2 byte encoding + */ + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-qual" + +static int +towide_dbcs(wchar_t *wc, const char *mb, unsigned n) +{ + wchar_t c; + + c = *(uint8_t *)mb; + + if ((c & 0x80) == 0) { + /* 7-bit */ + *wc = c; + return (1); + } + if (n < 2) { + werr("incomplete character sequence (%s)", show_mb(mb)); + return (-1); + } + + /* Store both bytes as a single 16-bit wide. */ + c <<= 8; + c |= (uint8_t)(mb[1]); + *wc = c; + return (2); +} + +/* + * Most multibyte locales just convert the wide character to the multibyte + * form by stripping leading null bytes, and writing the 32-bit quantity + * in big-endian order. + */ +int +tomb_mbs(char *mb, wchar_t wc) +{ + uint8_t *s = (uint8_t *)mb; + int n = 0, c; + + if ((wc & 0xff000000U) != 0) { + n = 4; + } else if ((wc & 0x00ff0000U) != 0) { + n = 3; + } else if ((wc & 0x0000ff00U) != 0) { + n = 2; + } else { + n = 1; + } + c = n; + while (n) { + n--; + s[n] = wc & 0xff; + wc >>= 8; + } + /* ensure null termination */ + s[c] = 0; + return (c); +} + + +/* + * big5 is a simple dual byte character set. + */ +int +towide_big5(wchar_t *wc, const char *mb, unsigned n) +{ + return (towide_dbcs(wc, mb, n)); +} + +/* + * GBK encodes wides in the same way that big5 does, the high order + * bit of the first byte indicates a double byte character. + */ +int +towide_gbk(wchar_t *wc, const char *mb, unsigned n) +{ + return (towide_dbcs(wc, mb, n)); +} + +/* + * GB2312 is another DBCS. Its cleaner than others in that the second + * byte does not encode ASCII, but it supports characters. + */ +int +towide_gb2312(wchar_t *wc, const char *mb, unsigned n) +{ + return (towide_dbcs(wc, mb, n)); +} + +/* + * GB18030. This encodes as 8, 16, or 32-bits. + * 7-bit values are in 1 byte, 4 byte sequences are used when + * the second byte encodes 0x30-39 and all other sequences are 2 bytes. + */ +int +towide_gb18030(wchar_t *wc, const char *mb, unsigned n) +{ + wchar_t c; + + c = *(uint8_t *)mb; + + if ((c & 0x80) == 0) { + /* 7-bit */ + *wc = c; + return (1); + } + if (n < 2) { + werr("incomplete character sequence (%s)", show_mb(mb)); + return (-1); + } + + /* pull in the second byte */ + c <<= 8; + c |= (uint8_t)(mb[1]); + + if (((c & 0xff) >= 0x30) && ((c & 0xff) <= 0x39)) { + if (n < 4) { + werr("incomplete 4-byte character sequence (%s)", + show_mb(mb)); + return (-1); + } + c <<= 8; + c |= (uint8_t)(mb[2]); + c <<= 8; + c |= (uint8_t)(mb[3]); + *wc = c; + return (4); + } + + *wc = c; + return (2); +} + +/* + * MS-Kanji (aka SJIS) is almost a clean DBCS like the others, but it + * also has a range of single byte characters above 0x80. (0xa1-0xdf). + */ +int +towide_mskanji(wchar_t *wc, const char *mb, unsigned n) +{ + wchar_t c; + + c = *(uint8_t *)mb; + + if ((c < 0x80) || ((c > 0xa0) && (c < 0xe0))) { + /* 7-bit */ + *wc = c; + return (1); + } + + if (n < 2) { + werr("incomplete character sequence (%s)", show_mb(mb)); + return (-1); + } + + /* Store both bytes as a single 16-bit wide. */ + c <<= 8; + c |= (uint8_t)(mb[1]); + *wc = c; + return (2); +} + +/* + * EUC forms. EUC encodings are "variable". FreeBSD carries some additional + * variable data to encode these, but we're going to treat each as independent + * instead. Its the only way we can sensibly move forward. + * + * Note that the way in which the different EUC forms vary is how wide + * CS2 and CS3 are and what the first byte of them is. + */ +static int +towide_euc_impl(wchar_t *wc, const char *mb, unsigned n, + uint8_t cs2, uint8_t cs2width, uint8_t cs3, uint8_t cs3width) +{ + int i; + int width = 2; + wchar_t c; + + c = *(uint8_t *)mb; + + /* + * All variations of EUC encode 7-bit ASCII as one byte, and use + * additional bytes for more than that. + */ + if ((c & 0x80) == 0) { + /* 7-bit */ + *wc = c; + return (1); + } + + /* + * All EUC variants reserve 0xa1-0xff to identify CS1, which + * is always two bytes wide. Note that unused CS will be zero, + * and that cannot be true because we know that the high order + * bit must be set. + */ + if (c >= 0xa1) { + width = 2; + } else if (c == cs2) { + width = cs2width; + } else if (c == cs3) { + width = cs3width; + } + + if ((int)n < width) { + werr("incomplete character sequence (%s)", show_mb(mb)); + return (-1); + } + + for (i = 1; i < width; i++) { + /* pull in the next byte */ + c <<= 8; + c |= (uint8_t)(mb[i]); + } + + *wc = c; + return (width); +} + +#pragma GCC diagnostic pop + +/* + * EUC-CN encodes as follows: + * + * Code set 0 (ASCII): 0x21-0x7E + * Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE + * Code set 2: unused + * Code set 3: unused + */ +int +towide_euccn(wchar_t *wc, const char *mb, unsigned n) +{ + return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0)); +} + +/* + * EUC-JP encodes as follows: + * + * Code set 0 (ASCII or JIS X 0201-1976 Roman): 0x21-0x7E + * Code set 1 (JIS X 0208): 0xA1A1-0xFEFE + * Code set 2 (half-width katakana): 0x8EA1-0x8EDF + * Code set 3 (JIS X 0212-1990): 0x8FA1A1-0x8FFEFE + */ +int +towide_eucjp(wchar_t *wc, const char *mb, unsigned n) +{ + return (towide_euc_impl(wc, mb, n, 0x8e, 2, 0x8f, 3)); +} + +/* + * EUC-KR encodes as follows: + * + * Code set 0 (ASCII or KS C 5636-1993): 0x21-0x7E + * Code set 1 (KS C 5601-1992): 0xA1A1-0xFEFE + * Code set 2: unused + * Code set 3: unused + */ +int +towide_euckr(wchar_t *wc, const char *mb, unsigned n) +{ + return (towide_euc_impl(wc, mb, n, 0, 0, 0, 0)); +} + +/* + * EUC-TW encodes as follows: + * + * Code set 0 (ASCII): 0x21-0x7E + * Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE + * Code set 2 (CNS 11643-1992 Planes 1-16): 0x8EA1A1A1-0x8EB0FEFE + * Code set 3: unused + */ +int +towide_euctw(wchar_t *wc, const char *mb, unsigned n) +{ + return (towide_euc_impl(wc, mb, n, 0x8e, 4, 0, 0)); +} + +/* + * Public entry points. + */ + +int +to_wide(wchar_t *wc, const char *mb) +{ + /* this won't fail hard */ + return (_towide(wc, mb, strlen(mb))); +} + +int +to_mb(char *mb, wchar_t wc) +{ + int rv; + + if ((rv = _tomb(mb, wc)) < 0) { + errf(widemsg); + free(widemsg); + widemsg = NULL; + } + return (rv); +} + +char * +to_mb_string(const wchar_t *wcs) +{ + char *mbs; + char *ptr; + int len; + + mbs = malloc((wcslen(wcs) * mb_cur_max) + 1); + if (mbs == NULL) { + errf("out of memory"); + return (NULL); + } + ptr = mbs; + while (*wcs) { + if ((len = to_mb(ptr, *wcs)) < 0) { + INTERR; + free(mbs); + return (NULL); + } + wcs++; + ptr += len; + } + *ptr = 0; + return (mbs); +} + +void +set_wide_encoding(const char *encoding) +{ + int i; + + _towide = towide_none; + _tomb = tomb_none; + _encoding = "NONE"; + _nbits = 8; + + for (i = 0; mb_encodings[i].name; i++) { + if (strcasecmp(encoding, mb_encodings[i].name) == 0) { + _towide = mb_encodings[i].towide; + _tomb = mb_encodings[i].tomb; + _encoding = mb_encodings[i].cname; + _nbits = mb_encodings[i].nbits; + break; + } + } +} + +const char * +get_wide_encoding(void) +{ + return (_encoding); +} + +int +max_wide(void) +{ + return ((int)((1U << _nbits) - 1)); +}