057ca2d437
The localedef tool can read entire (and unmodified) CLDR posix definition files, and generate all 6 LC categories: LC_COLLATE, LC_CTYPE, LC_TIME, LC_NUMERIC, LC_MONETARY and LC_MESSAGES. This tool has a long history with Solaris. The Nexenta developers modified it to read CLDR files and created the much richer collation formats. The libc collation functions have to be modified to read the new format (called "BSD-1.0") and to handle the new data structures. The result will be that locale-sensitive tools and functions will now properly sort multibyte and unicode strings. Obtained from: Dragonfly
465 lines
11 KiB
C
465 lines
11 KiB
C
/*
|
|
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
|
|
* Copyright 2012 Garrett D'Amore <garrett@damore.org> All rights reserved.
|
|
* Copyright 2015 John Marino <draco@marino.st>
|
|
*
|
|
* This source code is derived from the illumos localedef command, and
|
|
* provided under BSD-style license terms by Nexenta Systems, Inc.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
*
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/*
|
|
* LC_CTYPE database generation routines for localedef.
|
|
*/
|
|
#include <sys/cdefs.h>
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
#include <sys/avl.h>
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <stddef.h>
|
|
#include <string.h>
|
|
#include <sys/types.h>
|
|
#include <wchar.h>
|
|
#include <ctype.h>
|
|
#include <wctype.h>
|
|
#include <unistd.h>
|
|
#include "localedef.h"
|
|
#include "parser.h"
|
|
#include "runefile.h"
|
|
|
|
|
|
/*
 * Short aliases for the _CTYPE_* class bits used throughout this file.
 */
#define	_ISUPPER	_CTYPE_U
#define	_ISLOWER	_CTYPE_L
#define	_ISDIGIT	_CTYPE_D
#define	_ISXDIGIT	_CTYPE_X
#define	_ISSPACE	_CTYPE_S
#define	_ISBLANK	_CTYPE_B
#define	_ISALPHA	_CTYPE_A
#define	_ISPUNCT	_CTYPE_P
#define	_ISGRAPH	_CTYPE_G
#define	_ISPRINT	_CTYPE_R
#define	_ISCNTRL	_CTYPE_C

/*
 * Extension classes, applied by add_ctype_impl() for the phonogram (_E1),
 * ideogram (_E2), english (_E3), number (_E4) and special (_E5) keywords.
 * _E3 and _E4 expand to 0, so those two keywords contribute no bit of
 * their own beyond print/graph.
 */
#define	_E1		_CTYPE_Q
#define	_E2		_CTYPE_I
#define	_E3		0
#define	_E4		0
#define	_E5		_CTYPE_T
|
|
|
|
/* Every character recorded so far, ordered by ctype_compare() on "wc". */
static avl_tree_t	ctypes;

/* Code point most recently set by add_ctype() or add_ctype_range(). */
static wchar_t		last_ctype;
|
|
|
|
typedef struct ctype_node {
|
|
wchar_t wc;
|
|
int32_t ctype;
|
|
int32_t toupper;
|
|
int32_t tolower;
|
|
avl_node_t avl;
|
|
} ctype_node_t;
|
|
|
|
typedef struct width_node {
|
|
wchar_t start;
|
|
wchar_t end;
|
|
int8_t width;
|
|
avl_node_t avl;
|
|
} width_node_t;
|
|
|
|
static int
|
|
ctype_compare(const void *n1, const void *n2)
|
|
{
|
|
const ctype_node_t *c1 = n1;
|
|
const ctype_node_t *c2 = n2;
|
|
|
|
return (c1->wc < c2->wc ? -1 : c1->wc > c2->wc ? 1 : 0);
|
|
}
|
|
|
|
void
|
|
init_ctype(void)
|
|
{
|
|
avl_create(&ctypes, ctype_compare, sizeof (ctype_node_t),
|
|
offsetof(ctype_node_t, avl));
|
|
}
|
|
|
|
|
|
static void
|
|
add_ctype_impl(ctype_node_t *ctn)
|
|
{
|
|
switch (last_kw) {
|
|
case T_ISUPPER:
|
|
ctn->ctype |= (_ISUPPER | _ISALPHA | _ISGRAPH | _ISPRINT);
|
|
break;
|
|
case T_ISLOWER:
|
|
ctn->ctype |= (_ISLOWER | _ISALPHA | _ISGRAPH | _ISPRINT);
|
|
break;
|
|
case T_ISALPHA:
|
|
ctn->ctype |= (_ISALPHA | _ISGRAPH | _ISPRINT);
|
|
break;
|
|
case T_ISDIGIT:
|
|
ctn->ctype |= (_ISDIGIT | _ISGRAPH | _ISPRINT | _ISXDIGIT);
|
|
break;
|
|
case T_ISSPACE:
|
|
ctn->ctype |= _ISSPACE;
|
|
break;
|
|
case T_ISCNTRL:
|
|
ctn->ctype |= _ISCNTRL;
|
|
break;
|
|
case T_ISGRAPH:
|
|
ctn->ctype |= (_ISGRAPH | _ISPRINT);
|
|
break;
|
|
case T_ISPRINT:
|
|
ctn->ctype |= _ISPRINT;
|
|
break;
|
|
case T_ISPUNCT:
|
|
ctn->ctype |= (_ISPUNCT | _ISGRAPH | _ISPRINT);
|
|
break;
|
|
case T_ISXDIGIT:
|
|
ctn->ctype |= (_ISXDIGIT | _ISPRINT);
|
|
break;
|
|
case T_ISBLANK:
|
|
ctn->ctype |= (_ISBLANK | _ISSPACE);
|
|
break;
|
|
case T_ISPHONOGRAM:
|
|
ctn->ctype |= (_E1 | _ISPRINT | _ISGRAPH);
|
|
break;
|
|
case T_ISIDEOGRAM:
|
|
ctn->ctype |= (_E2 | _ISPRINT | _ISGRAPH);
|
|
break;
|
|
case T_ISENGLISH:
|
|
ctn->ctype |= (_E3 | _ISPRINT | _ISGRAPH);
|
|
break;
|
|
case T_ISNUMBER:
|
|
ctn->ctype |= (_E4 | _ISPRINT | _ISGRAPH);
|
|
break;
|
|
case T_ISSPECIAL:
|
|
ctn->ctype |= (_E5 | _ISPRINT | _ISGRAPH);
|
|
break;
|
|
case T_ISALNUM:
|
|
/*
|
|
* We can't do anything with this. The character
|
|
* should already be specified as a digit or alpha.
|
|
*/
|
|
break;
|
|
default:
|
|
errf("not a valid character class");
|
|
}
|
|
}
|
|
|
|
static ctype_node_t *
|
|
get_ctype(wchar_t wc)
|
|
{
|
|
ctype_node_t srch;
|
|
ctype_node_t *ctn;
|
|
avl_index_t where;
|
|
|
|
srch.wc = wc;
|
|
if ((ctn = avl_find(&ctypes, &srch, &where)) == NULL) {
|
|
if ((ctn = calloc(1, sizeof (*ctn))) == NULL) {
|
|
errf("out of memory");
|
|
return (NULL);
|
|
}
|
|
ctn->wc = wc;
|
|
|
|
avl_insert(&ctypes, ctn, where);
|
|
}
|
|
return (ctn);
|
|
}
|
|
|
|
void
|
|
add_ctype(int val)
|
|
{
|
|
ctype_node_t *ctn;
|
|
|
|
if ((ctn = get_ctype(val)) == NULL) {
|
|
INTERR;
|
|
return;
|
|
}
|
|
add_ctype_impl(ctn);
|
|
last_ctype = ctn->wc;
|
|
}
|
|
|
|
void
|
|
add_ctype_range(int end)
|
|
{
|
|
ctype_node_t *ctn;
|
|
wchar_t cur;
|
|
|
|
if (end < last_ctype) {
|
|
errf("malformed character range (%u ... %u))",
|
|
last_ctype, end);
|
|
return;
|
|
}
|
|
for (cur = last_ctype + 1; cur <= end; cur++) {
|
|
if ((ctn = get_ctype(cur)) == NULL) {
|
|
INTERR;
|
|
return;
|
|
}
|
|
add_ctype_impl(ctn);
|
|
}
|
|
last_ctype = end;
|
|
|
|
}
|
|
|
|
/*
|
|
* A word about widths: if the width mask is specified, then libc
|
|
* unconditionally honors it. Otherwise, it assumes printable
|
|
* characters have width 1, and non-printable characters have width
|
|
* -1 (except for NUL, which is special with width 0). Hence, we have
|
|
* no need to inject defaults here -- the "default" unset value of 0
|
|
* indicates that libc should use its own logic in wcwidth as described.
|
|
*/
|
|
void
|
|
add_width(int wc, int width)
|
|
{
|
|
ctype_node_t *ctn;
|
|
|
|
if ((ctn = get_ctype(wc)) == NULL) {
|
|
INTERR;
|
|
return;
|
|
}
|
|
ctn->ctype &= ~(_CTYPE_SWM);
|
|
switch (width) {
|
|
case 0:
|
|
ctn->ctype |= _CTYPE_SW0;
|
|
break;
|
|
case 1:
|
|
ctn->ctype |= _CTYPE_SW1;
|
|
break;
|
|
case 2:
|
|
ctn->ctype |= _CTYPE_SW2;
|
|
break;
|
|
case 3:
|
|
ctn->ctype |= _CTYPE_SW3;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/*
 * Assign "width" to every character from "start" through "end" inclusive.
 * Nothing happens when start is greater than end.
 */
void
add_width_range(int start, int end, int width)
{
	int wc = start;

	while (wc <= end) {
		add_width(wc, width);
		wc++;
	}
}
|
|
|
|
void
|
|
add_caseconv(int val, int wc)
|
|
{
|
|
ctype_node_t *ctn;
|
|
|
|
ctn = get_ctype(val);
|
|
if (ctn == NULL) {
|
|
INTERR;
|
|
return;
|
|
}
|
|
|
|
switch (last_kw) {
|
|
case T_TOUPPER:
|
|
ctn->toupper = wc;
|
|
break;
|
|
case T_TOLOWER:
|
|
ctn->tolower = wc;
|
|
break;
|
|
default:
|
|
INTERR;
|
|
break;
|
|
}
|
|
}
|
|
|
|
void
|
|
dump_ctype(void)
|
|
{
|
|
FILE *f;
|
|
_FileRuneLocale rl;
|
|
ctype_node_t *ctn, *last_ct, *last_lo, *last_up;
|
|
_FileRuneEntry *ct = NULL;
|
|
_FileRuneEntry *lo = NULL;
|
|
_FileRuneEntry *up = NULL;
|
|
wchar_t wc;
|
|
|
|
(void) memset(&rl, 0, sizeof (rl));
|
|
last_ct = NULL;
|
|
last_lo = NULL;
|
|
last_up = NULL;
|
|
|
|
if ((f = open_category()) == NULL)
|
|
return;
|
|
|
|
(void) memcpy(rl.magic, _FILE_RUNE_MAGIC_1, 8);
|
|
(void) strncpy(rl.encoding, get_wide_encoding(), sizeof (rl.encoding));
|
|
|
|
/*
|
|
* Initialize the identity map.
|
|
*/
|
|
for (wc = 0; (unsigned)wc < _CACHED_RUNES; wc++) {
|
|
rl.maplower[wc] = wc;
|
|
rl.mapupper[wc] = wc;
|
|
}
|
|
|
|
for (ctn = avl_first(&ctypes); ctn; ctn = AVL_NEXT(&ctypes, ctn)) {
|
|
int conflict = 0;
|
|
|
|
|
|
wc = ctn->wc;
|
|
|
|
/*
|
|
* POSIX requires certain portable characters have
|
|
* certain types. Add them if they are missing.
|
|
*/
|
|
if ((wc >= 1) && (wc <= 127)) {
|
|
if ((wc >= 'A') && (wc <= 'Z'))
|
|
ctn->ctype |= _ISUPPER;
|
|
if ((wc >= 'a') && (wc <= 'z'))
|
|
ctn->ctype |= _ISLOWER;
|
|
if ((wc >= '0') && (wc <= '9'))
|
|
ctn->ctype |= _ISDIGIT;
|
|
if (strchr(" \f\n\r\t\v", (char)wc) != NULL)
|
|
ctn->ctype |= _ISSPACE;
|
|
if (strchr("0123456789ABCDEFabcdef", (char)wc) != NULL)
|
|
ctn->ctype |= _ISXDIGIT;
|
|
if (strchr(" \t", (char)wc))
|
|
ctn->ctype |= _ISBLANK;
|
|
|
|
/*
|
|
* Technically these settings are only
|
|
* required for the C locale. However, it
|
|
* turns out that because of the historical
|
|
* version of isprint(), we need them for all
|
|
* locales as well. Note that these are not
|
|
* necessarily valid punctation characters in
|
|
* the current language, but ispunct() needs
|
|
* to return TRUE for them.
|
|
*/
|
|
if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
|
|
(char)wc))
|
|
ctn->ctype |= _ISPUNCT;
|
|
}
|
|
|
|
/*
|
|
* POSIX also requires that certain types imply
|
|
* others. Add any inferred types here.
|
|
*/
|
|
if (ctn->ctype & (_ISUPPER |_ISLOWER))
|
|
ctn->ctype |= _ISALPHA;
|
|
if (ctn->ctype & _ISDIGIT)
|
|
ctn->ctype |= _ISXDIGIT;
|
|
if (ctn->ctype & _ISBLANK)
|
|
ctn->ctype |= _ISSPACE;
|
|
if (ctn->ctype & (_ISALPHA|_ISDIGIT|_ISXDIGIT))
|
|
ctn->ctype |= _ISGRAPH;
|
|
if (ctn->ctype & _ISGRAPH)
|
|
ctn->ctype |= _ISPRINT;
|
|
|
|
/*
|
|
* Finally, POSIX requires that certain combinations
|
|
* are invalid. We don't flag this as a fatal error,
|
|
* but we will warn about.
|
|
*/
|
|
if ((ctn->ctype & _ISALPHA) &&
|
|
(ctn->ctype & (_ISPUNCT|_ISDIGIT)))
|
|
conflict++;
|
|
if ((ctn->ctype & _ISPUNCT) &
|
|
(ctn->ctype & (_ISDIGIT|_ISALPHA|_ISXDIGIT)))
|
|
conflict++;
|
|
if ((ctn->ctype & _ISSPACE) && (ctn->ctype & _ISGRAPH))
|
|
conflict++;
|
|
if ((ctn->ctype & _ISCNTRL) & _ISPRINT)
|
|
conflict++;
|
|
if ((wc == ' ') && (ctn->ctype & (_ISPUNCT|_ISGRAPH)))
|
|
conflict++;
|
|
|
|
if (conflict) {
|
|
warn("conflicting classes for character 0x%x (%x)",
|
|
wc, ctn->ctype);
|
|
}
|
|
/*
|
|
* Handle the lower 256 characters using the simple
|
|
* optimization. Note that if we have not defined the
|
|
* upper/lower case, then we identity map it.
|
|
*/
|
|
if ((unsigned)wc < _CACHED_RUNES) {
|
|
rl.runetype[wc] = ctn->ctype;
|
|
if (ctn->tolower)
|
|
rl.maplower[wc] = ctn->tolower;
|
|
if (ctn->toupper)
|
|
rl.mapupper[wc] = ctn->toupper;
|
|
continue;
|
|
}
|
|
|
|
if ((last_ct != NULL) && (last_ct->ctype == ctn->ctype)) {
|
|
ct[rl.runetype_ext_nranges-1].max = wc;
|
|
last_ct = ctn;
|
|
} else {
|
|
rl.runetype_ext_nranges++;
|
|
ct = realloc(ct,
|
|
sizeof (*ct) * rl.runetype_ext_nranges);
|
|
ct[rl.runetype_ext_nranges - 1].min = wc;
|
|
ct[rl.runetype_ext_nranges - 1].max = wc;
|
|
ct[rl.runetype_ext_nranges - 1].map = ctn->ctype;
|
|
last_ct = ctn;
|
|
}
|
|
if (ctn->tolower == 0) {
|
|
last_lo = NULL;
|
|
} else if ((last_lo != NULL) &&
|
|
(last_lo->tolower + 1 == ctn->tolower)) {
|
|
lo[rl.maplower_ext_nranges-1].max = wc;
|
|
last_lo = ctn;
|
|
} else {
|
|
rl.maplower_ext_nranges++;
|
|
lo = realloc(lo,
|
|
sizeof (*lo) * rl.maplower_ext_nranges);
|
|
lo[rl.maplower_ext_nranges - 1].min = wc;
|
|
lo[rl.maplower_ext_nranges - 1].max = wc;
|
|
lo[rl.maplower_ext_nranges - 1].map = ctn->tolower;
|
|
last_lo = ctn;
|
|
}
|
|
|
|
if (ctn->toupper == 0) {
|
|
last_up = NULL;
|
|
} else if ((last_up != NULL) &&
|
|
(last_up->toupper + 1 == ctn->toupper)) {
|
|
up[rl.mapupper_ext_nranges-1].max = wc;
|
|
last_up = ctn;
|
|
} else {
|
|
rl.mapupper_ext_nranges++;
|
|
up = realloc(up,
|
|
sizeof (*up) * rl.mapupper_ext_nranges);
|
|
up[rl.mapupper_ext_nranges - 1].min = wc;
|
|
up[rl.mapupper_ext_nranges - 1].max = wc;
|
|
up[rl.mapupper_ext_nranges - 1].map = ctn->toupper;
|
|
last_up = ctn;
|
|
}
|
|
}
|
|
|
|
if ((wr_category(&rl, sizeof (rl), f) < 0) ||
|
|
(wr_category(ct, sizeof (*ct) * rl.runetype_ext_nranges, f) < 0) ||
|
|
(wr_category(lo, sizeof (*lo) * rl.maplower_ext_nranges, f) < 0) ||
|
|
(wr_category(up, sizeof (*up) * rl.mapupper_ext_nranges, f) < 0)) {
|
|
return;
|
|
}
|
|
|
|
close_category(f);
|
|
}
|