Add support for multibyte characters. The challenge here was to use

data structures that scale better with large character sets, instead of
arrays indexed by character value:
- Sets of characters to delete/squeeze are stored in a new "cset" structure,
which is implemented as a splay tree of extents. This structure has the
ability to store character classes (à la wctype(3)), but this is not
currently fully utilized.
- Mappings between characters are stored in a new "cmap" structure, which
is also a splay tree.
- The parser no longer builds arrays containing all the characters in a
particular class; instead, next() determines them on-the-fly using
nextwctype(3).
This commit is contained in:
tjr 2004-07-09 02:08:07 +00:00
parent fb654efba8
commit d291df1e3f
8 changed files with 879 additions and 199 deletions

View File

@ -1,6 +1,7 @@
# @(#)Makefile 8.1 (Berkeley) 6/6/93
# $FreeBSD$
PROG= tr
SRCS= str.c tr.c
SRCS= cmap.c cset.c str.c tr.c
.include <bsd.prog.mk>

212
usr.bin/tr/cmap.c Normal file
View File

@ -0,0 +1,212 @@
/*-
* Copyright (c) 2004 Tim J. Robbins.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* "Character map" ADT. Stores mappings between pairs of characters in a
* splay tree, with a lookup table cache to simplify looking up the first
* bunch of characters (which are presumably more common than others).
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <assert.h>
#include <limits.h>
#include <stdbool.h>
#include <stdlib.h>
#include <wchar.h>
#include "cmap.h"
static struct cmapnode *cmap_splay(struct cmapnode *, wint_t);
/*
 * cmap_alloc --
 *	Create an empty character map.  Characters without an explicit
 *	mapping initially map to themselves (CM_DEF_SELF) and the lookup
 *	cache starts out invalid.  Returns NULL if the allocation fails.
 */
struct cmap *
cmap_alloc(void)
{
	struct cmap *map;

	map = malloc(sizeof(*map));
	if (map != NULL) {
		map->cm_havecache = false;
		map->cm_def = CM_DEF_SELF;
		map->cm_root = NULL;
		map->cm_max = 0;
		map->cm_min = 0;
	}
	return (map);
}
/*
 * cmap_add --
 *	Record that "from" maps to "to", overwriting any mapping already
 *	stored for "from".  The lookup cache is invalidated on every call.
 *	Returns false if a tree node could not be allocated, true otherwise.
 */
bool
cmap_add(struct cmap *cm, wint_t from, wint_t to)
{
	struct cmapnode *top, *fresh;

	cm->cm_havecache = false;

	top = NULL;
	if (cm->cm_root != NULL) {
		/* Bring the node for "from", or a neighbour of it, to the root. */
		top = cmap_splay(cm->cm_root, from);
		cm->cm_root = top;
		if (top->cmn_from == from) {
			/* Key already present: only the target changes. */
			top->cmn_to = to;
			return (true);
		}
	}

	fresh = malloc(sizeof(*fresh));
	if (fresh == NULL)
		return (false);
	fresh->cmn_from = from;
	fresh->cmn_to = to;

	if (top == NULL) {
		/* First mapping: it is the whole tree and defines both bounds. */
		fresh->cmn_left = NULL;
		fresh->cmn_right = NULL;
		cm->cm_min = from;
		cm->cm_max = from;
	} else {
		/*
		 * Make the new node the root, hanging the old root and the
		 * appropriate one of its subtrees on either side of it.
		 */
		if (from < top->cmn_from) {
			fresh->cmn_left = top->cmn_left;
			fresh->cmn_right = top;
			top->cmn_left = NULL;
		} else {
			fresh->cmn_right = top->cmn_right;
			fresh->cmn_left = top;
			top->cmn_right = NULL;
		}
		/* Keep track of the smallest and largest mapped character. */
		if (from < cm->cm_min)
			cm->cm_min = from;
		if (from > cm->cm_max)
			cm->cm_max = from;
	}
	cm->cm_root = fresh;
	return (true);
}
/*
* cmap_lookup_hard --
* Look up the mapping for a character using the cache.
*/
wint_t
cmap_lookup_hard(struct cmap *cm, wint_t ch)
{
if (cm->cm_root != NULL) {
cm->cm_root = cmap_splay(cm->cm_root, ch);
if (cm->cm_root->cmn_from == ch)
return (cm->cm_root->cmn_to);
}
return (cm->cm_def == CM_DEF_SELF ? ch : cm->cm_def);
}
/*
 * cmap_cache --
 *	Rebuild the lookup cache: store the result of cmap_lookup_hard()
 *	for each of the first CM_CACHE_SIZE character values, then mark
 *	the cache as valid.
 */
void
cmap_cache(struct cmap *cm)
{
	wint_t slot;

	slot = 0;
	while (slot < CM_CACHE_SIZE) {
		cm->cm_cache[slot] = cmap_lookup_hard(cm, slot);
		slot++;
	}
	cm->cm_havecache = true;
}
/*
 * cmap_default --
 *	Set the value that characters without an explicit mapping map to,
 *	and return the previous default.  The special value CM_DEF_SELF
 *	means such characters map to themselves.  The lookup cache is
 *	invalidated because cached results may depend on the default.
 */
wint_t
cmap_default(struct cmap *cm, wint_t def)
{
	const wint_t previous = cm->cm_def;

	cm->cm_havecache = false;
	cm->cm_def = def;
	return (previous);
}
/*
 * cmap_splay --
 *	Top-down splay of the tree rooted at "t" around the key "ch".
 *	Returns the new root: the node whose cmn_from equals "ch" if there
 *	is one, otherwise the last node reached on the search path.
 *	"t" must not be NULL.
 */
static struct cmapnode *
cmap_splay(struct cmapnode *t, wint_t ch)
{
struct cmapnode N, *l, *r, *y;
/*
* Based on public domain code from Sleator.
*/
assert(t != NULL);
/*
 * N is a scratch header.  Nodes found to be smaller than ch are chained
 * through l (via cmn_right, starting at N.cmn_right); nodes found to be
 * larger are chained through r (via cmn_left, starting at N.cmn_left).
 */
N.cmn_left = N.cmn_right = NULL;
l = r = &N;
for (;;) {
if (ch < t->cmn_from) {
/* Two steps left in a row: rotate right first. */
if (t->cmn_left != NULL &&
ch < t->cmn_left->cmn_from) {
y = t->cmn_left;
t->cmn_left = y->cmn_right;
y->cmn_right = t;
t = y;
}
if (t->cmn_left == NULL)
break;
/* Link t into the "larger" chain and descend left. */
r->cmn_left = t;
r = t;
t = t->cmn_left;
} else if (ch > t->cmn_from) {
/* Two steps right in a row: rotate left first. */
if (t->cmn_right != NULL &&
ch > t->cmn_right->cmn_from) {
y = t->cmn_right;
t->cmn_right = y->cmn_left;
y->cmn_left = t;
t = y;
}
if (t->cmn_right == NULL)
break;
/* Link t into the "smaller" chain and descend right. */
l->cmn_right = t;
l = t;
t = t->cmn_right;
} else
break;
}
/* Reassemble: hang the two chains, plus t's old subtrees, under t. */
l->cmn_right = t->cmn_left;
r->cmn_left = t->cmn_right;
t->cmn_left = N.cmn_right;
t->cmn_right = N.cmn_left;
return (t);
}

83
usr.bin/tr/cmap.h Normal file
View File

@ -0,0 +1,83 @@
/*-
* Copyright (c) 2004 Tim J. Robbins.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef CMAP_H
#define CMAP_H
#include <limits.h>
#include <stdbool.h>
#include <wchar.h>
/*
 * One node of the character map's splay tree: a single mapping from
 * cmn_from to cmn_to.  The tree is ordered by cmn_from.
 */
struct cmapnode {
wint_t cmn_from;
wint_t cmn_to;
struct cmapnode *cmn_left;
struct cmapnode *cmn_right;
};
/*
 * A character map: a splay tree of explicit mappings, a default for
 * characters that have none, and a small direct-lookup cache.
 */
struct cmap {
#define CM_CACHE_SIZE 128
/* Lookup results for characters 0 .. CM_CACHE_SIZE-1, filled by cmap_cache(). */
wint_t cm_cache[CM_CACHE_SIZE];
/* Set by cmap_cache(); cleared by cmap_add() and cmap_default(). */
bool cm_havecache;
/* Root of the splay tree, or NULL while no mapping has been added. */
struct cmapnode *cm_root;
/* Default value meaning "unmapped characters map to themselves". */
#define CM_DEF_SELF -2
/* What unmapped characters map to (see CM_DEF_SELF). */
wint_t cm_def;
/* Smallest and largest "from" value tracked by cmap_add(). */
wint_t cm_min;
wint_t cm_max;
};
/*
 * Public interface, implemented in cmap.c.  cmap_alloc() returns NULL and
 * cmap_add() returns false when memory cannot be allocated.
 */
struct cmap * cmap_alloc(void);
bool cmap_add(struct cmap *, wint_t, wint_t);
wint_t cmap_lookup_hard(struct cmap *, wint_t);
void cmap_cache(struct cmap *);
wint_t cmap_default(struct cmap *, wint_t);
/*
 * cmap_lookup --
 *	Look up the mapping for a character.  The answer comes from the
 *	cache when the cache is valid and covers "from"; otherwise the
 *	lookup falls back to cmap_lookup_hard().
 */
static __inline wint_t
cmap_lookup(struct cmap *cm, wint_t from)
{

	/*
	 * wint_t may be a signed type (this code stores negative markers
	 * such as CM_DEF_SELF in it).  Compare as unsigned so that a
	 * negative value can never be used as a cache index; it is passed
	 * on to the tree lookup instead.
	 */
	if (cm->cm_havecache && (unsigned long)from < CM_CACHE_SIZE)
		return (cm->cm_cache[from]);
	return (cmap_lookup_hard(cm, from));
}
/*
 * cmap_min --
 *	Return the map's recorded lower bound (cm_min), as maintained by
 *	cmap_alloc() and cmap_add().
 */
static __inline wint_t
cmap_min(struct cmap *cm)
{
	const wint_t lowest = cm->cm_min;

	return (lowest);
}
/*
 * cmap_max --
 *	Return the map's recorded upper bound (cm_max), as maintained by
 *	cmap_alloc() and cmap_add().
 */
static __inline wint_t
cmap_max(struct cmap *cm)
{
	const wint_t highest = cm->cm_max;

	return (highest);
}
#endif

303
usr.bin/tr/cset.c Normal file
View File

@ -0,0 +1,303 @@
/*-
* Copyright (c) 2004 Tim J. Robbins.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* "Set of characters" ADT implemented as a splay tree of extents, with
* a lookup table cache to simplify looking up the first bunch of
* characters (which are presumably more common than others).
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <wchar.h>
#include <wctype.h>
#include "cset.h"
static struct csnode * cset_delete(struct csnode *, wchar_t);
static __inline int cset_rangecmp(struct csnode *, wchar_t);
static struct csnode * cset_splay(struct csnode *, wchar_t);
/*
 * cset_alloc --
 *	Allocate an empty, non-inverted set of characters with no
 *	character classes and an invalid cache.  Returns NULL if memory
 *	cannot be allocated.
 */
struct cset *
cset_alloc(void)
{
	struct cset *cs;

	if ((cs = malloc(sizeof(*cs))) == NULL)
		return (NULL);
	cs->cs_root = NULL;
	cs->cs_classes = NULL;
	cs->cs_havecache = false;
	/*
	 * malloc() does not clear memory, and both cset_in_hard() and
	 * cset_invert() read this flag, so it must start out well-defined.
	 */
	cs->cs_invert = false;
	return (cs);
}
/*
 * cset_add --
 * Add a character to the set.  The set is kept as a splay tree of
 * [csn_min, csn_max] extents; the character either extends an existing
 * extent or gets a new single-character extent, which is then merged
 * with adjacent extents where this code detects them.  The cache is
 * invalidated.  Returns false if a node could not be allocated, true
 * otherwise (including when the character was already present).
 */
bool
cset_add(struct cset *cs, wchar_t ch)
{
struct csnode *csn, *ncsn;
wchar_t oval;
cs->cs_havecache = false;
/*
* Inserting into empty tree; new item becomes the root.
*/
if (cs->cs_root == NULL) {
csn = malloc(sizeof(*cs->cs_root));
if (csn == NULL)
return (false);
csn->csn_left = csn->csn_right = NULL;
csn->csn_min = csn->csn_max = ch;
cs->cs_root = csn;
return (true);
}
/*
* Splay to check whether the item already exists, and otherwise,
* where we should put it.
*/
csn = cs->cs_root = cset_splay(cs->cs_root, ch);
/*
* Easy cases where we can avoid allocating a new node:
* (a) node already exists.
* (b) we can lower the extent's "min" to accommodate this
* character without having to coalesce.
* (c) we can raise the extent's "max" without having
* to coalesce.
*
* NOTE(review): the "without having to coalesce" tests in (b) and (c)
* look only at the root's direct children, not at its in-order
* neighbours deeper in the tree -- confirm that leaving two adjacent
* extents unmerged in that case is acceptable.
*/
if (cset_rangecmp(csn, ch) == 0)
return (true);
if (ch + 1 == csn->csn_min && (csn->csn_left == NULL ||
ch > csn->csn_left->csn_max + 1)) {
csn->csn_min--;
return (true);
}
if (ch == csn->csn_max + 1 && (csn->csn_right == NULL ||
ch + 1 < csn->csn_right->csn_min)) {
csn->csn_max++;
return (true);
}
/*
* Allocate a new node and link it into the tree as a direct
* child of the root.
*/
ncsn = malloc(sizeof(*ncsn));
if (ncsn == NULL)
return (false);
ncsn->csn_min = ncsn->csn_max = ch;
/* The new node takes over as root, with the old root on one side. */
if (cset_rangecmp(csn, ch) < 0) {
ncsn->csn_left = csn->csn_left;
ncsn->csn_right = csn;
csn->csn_left = NULL;
} else {
ncsn->csn_right = csn->csn_right;
ncsn->csn_left = csn;
csn->csn_right = NULL;
}
cs->cs_root = ncsn;
/*
* Splay to bring the newly inserted node to the root, then
* coalesce with left and right neighbours if possible.
*/
csn = cs->cs_root = cset_splay(cs->cs_root, ch);
if (csn->csn_left != NULL &&
csn->csn_left->csn_max + 1 == csn->csn_min) {
/* Remember the left extent's start, drop it, and absorb its range. */
oval = csn->csn_left->csn_min;
cs->cs_root = cset_delete(cs->cs_root,
csn->csn_left->csn_min);
ncsn->csn_min = oval;
}
/* The delete above may have changed the root; bring the new node back. */
csn = cs->cs_root = cset_splay(cs->cs_root, ch);
if (csn->csn_right != NULL &&
csn->csn_right->csn_min - 1 == csn->csn_max) {
/* Remember the right extent's end, drop it, and absorb its range. */
oval = csn->csn_right->csn_max;
cs->cs_root = cset_delete(cs->cs_root,
csn->csn_right->csn_min);
ncsn->csn_max = oval;
}
return (true);
}
/*
 * cset_in_hard --
 *	Decide whether a character belongs to the set by consulting the
 *	character-class list and then the extent tree, never the cache.
 *	A character matches a class entry when its iswctype() result
 *	differs from that entry's csc_invert flag.  The final answer is
 *	flipped when the set as a whole is inverted.  Searching the tree
 *	splays it as a side effect.
 */
bool
cset_in_hard(struct cset *cs, wchar_t ch)
{
	struct csclass *entry;
	bool inclass, member;

	member = false;
	for (entry = cs->cs_classes; entry != NULL; entry = entry->csc_next) {
		inclass = (iswctype(ch, entry->csc_type) != 0);
		if (inclass != entry->csc_invert) {
			member = true;
			break;
		}
	}
	/* Only search the extents if no class has already claimed ch. */
	if (!member && cs->cs_root != NULL) {
		cs->cs_root = cset_splay(cs->cs_root, ch);
		member = (cset_rangecmp(cs->cs_root, ch) == 0);
	}
	return (member != cs->cs_invert);
}
/*
 * cset_cache --
 *	Rebuild the membership cache: store the result of cset_in_hard()
 *	for each of the first CS_CACHE_SIZE character values, then mark
 *	the cache as valid.
 */
void
cset_cache(struct cset *cs)
{
	wchar_t slot;

	slot = 0;
	while (slot < CS_CACHE_SIZE) {
		cs->cs_cache[slot] = cset_in_hard(cs, slot);
		slot++;
	}
	cs->cs_havecache = true;
}
/*
 * cset_invert --
 *	Toggle the set's inversion flag, so that membership tests report
 *	the complement of what they reported before.  The cache is
 *	invalidated.
 */
void
cset_invert(struct cset *cs)
{

	cs->cs_havecache = false;
	cs->cs_invert = !cs->cs_invert;
}
/*
 * cset_addclass --
 *	Add a wctype()-style character class to the set.  When "invert"
 *	is true the entry matches characters that are NOT in the class.
 *	The cache is invalidated.  Returns false if the class entry could
 *	not be allocated, true otherwise.
 */
bool
cset_addclass(struct cset *cs, wctype_t type, bool invert)
{
	struct csclass *entry;

	if ((entry = malloc(sizeof(*entry))) == NULL)
		return (false);
	entry->csc_invert = invert;
	entry->csc_type = type;
	/* Push the new entry onto the front of the set's class list. */
	entry->csc_next = cs->cs_classes;
	cs->cs_classes = entry;
	cs->cs_havecache = false;
	return (true);
}
/*
 * cset_rangecmp --
 *	Compare a character with a node's extent: returns -1 if "ch" lies
 *	below csn_min, 1 if it lies above csn_max, and 0 if it falls
 *	within the extent.
 */
static __inline int
cset_rangecmp(struct csnode *t, wchar_t ch)
{

	if (ch >= t->csn_min)
		return (ch > t->csn_max ? 1 : 0);
	return (-1);
}
/*
 * cset_splay --
 *	Top-down splay of the extent tree rooted at "t" around the
 *	character "ch", using cset_rangecmp() as the ordering.  Returns
 *	the new root: the node whose extent contains "ch" if there is one,
 *	otherwise the last node reached on the search path.  "t" must not
 *	be NULL.
 */
static struct csnode *
cset_splay(struct csnode *t, wchar_t ch)
{
struct csnode N, *l, *r, *y;
/*
* Based on public domain code from Sleator.
*/
assert(t != NULL);
/*
 * N is a scratch header.  Nodes found to lie below ch are chained
 * through l (via csn_right, starting at N.csn_right); nodes found to lie
 * above ch are chained through r (via csn_left, starting at N.csn_left).
 */
N.csn_left = N.csn_right = NULL;
l = r = &N;
for (;;) {
if (cset_rangecmp(t, ch) < 0) {
/* Two steps left in a row: rotate right first. */
if (t->csn_left != NULL &&
cset_rangecmp(t->csn_left, ch) < 0) {
y = t->csn_left;
t->csn_left = y->csn_right;
y->csn_right = t;
t = y;
}
if (t->csn_left == NULL)
break;
/* Link t into the "above" chain and descend left. */
r->csn_left = t;
r = t;
t = t->csn_left;
} else if (cset_rangecmp(t, ch) > 0) {
/* Two steps right in a row: rotate left first. */
if (t->csn_right != NULL &&
cset_rangecmp(t->csn_right, ch) > 0) {
y = t->csn_right;
t->csn_right = y->csn_left;
y->csn_left = t;
t = y;
}
if (t->csn_right == NULL)
break;
/* Link t into the "below" chain and descend right. */
l->csn_right = t;
l = t;
t = t->csn_right;
} else
break;
}
/* Reassemble: hang the two chains, plus t's old subtrees, under t. */
l->csn_right = t->csn_left;
r->csn_left = t->csn_right;
t->csn_left = N.csn_right;
t->csn_right = N.csn_left;
return (t);
}
/*
 * cset_delete --
 *	Remove the extent containing "ch" from the tree rooted at "t" and
 *	free its node.  Returns the root of the remaining tree (NULL if
 *	the removed node was the only one).  "t" must not be NULL and some
 *	extent must contain "ch"; both are checked with assert().
 */
static struct csnode *
cset_delete(struct csnode *t, wchar_t ch)
{
	struct csnode *victim, *newroot;

	assert(t != NULL);
	victim = cset_splay(t, ch);
	assert(cset_rangecmp(victim, ch) == 0);

	newroot = victim->csn_right;
	if (victim->csn_left != NULL) {
		/*
		 * Splay the left subtree around ch and make the resulting
		 * root the new top, adopting the victim's right subtree.
		 */
		newroot = cset_splay(victim->csn_left, ch);
		newroot->csn_right = victim->csn_right;
	}
	free(victim);
	return (newroot);
}

75
usr.bin/tr/cset.h Normal file
View File

@ -0,0 +1,75 @@
/*-
* Copyright (c) 2004 Tim J. Robbins.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef CSET_H
#define CSET_H
#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>
/*
 * One node of the set's splay tree: an extent of characters running from
 * csn_min to csn_max inclusive.
 */
struct csnode {
wchar_t csn_min;
wchar_t csn_max;
struct csnode *csn_left;
struct csnode *csn_right;
};
/*
 * One entry in a set's singly linked list of wctype()-style character
 * classes, as created by cset_addclass().
 */
struct csclass {
/* Class handle passed to iswctype(). */
wctype_t csc_type;
/* When true, the entry matches characters that are not in the class. */
bool csc_invert;
/* NOTE(review): not read or written by cset.c -- confirm whether it is needed. */
bool csc_value;
struct csclass *csc_next;
};
/*
 * A set of characters: a splay tree of extents, a list of character
 * classes, an overall inversion flag and a small direct-lookup cache.
 */
struct cset {
#define CS_CACHE_SIZE 256
/* Membership results for characters 0 .. CS_CACHE_SIZE-1, filled by cset_cache(). */
bool cs_cache[CS_CACHE_SIZE];
/* Set by cset_cache(); cleared by cset_add(), cset_addclass() and cset_invert(). */
bool cs_havecache;
/* Head of the character-class list, or NULL if there are none. */
struct csclass *cs_classes;
/* Root of the extent tree, or NULL while no character has been added. */
struct csnode *cs_root;
/* When true, membership tests report the complement of the stored set. */
bool cs_invert;
};
/*
 * Public interface, implemented in cset.c.  cset_alloc() returns NULL and
 * cset_add()/cset_addclass() return false when memory cannot be allocated.
 */
bool cset_addclass(struct cset *, wctype_t, bool);
struct cset * cset_alloc(void);
bool cset_add(struct cset *, wchar_t);
void cset_invert(struct cset *);
bool cset_in_hard(struct cset *, wchar_t);
void cset_cache(struct cset *);
/*
 * cset_in --
 *	Determine whether a character is in the set.  The answer comes
 *	from the cache when the cache is valid and covers "ch"; otherwise
 *	the lookup falls back to cset_in_hard().
 */
static __inline bool
cset_in(struct cset *cs, wchar_t ch)
{

	/*
	 * wchar_t may be a signed type.  Compare as unsigned so that a
	 * negative value can never be used as a cache index; it is passed
	 * on to the full lookup instead.
	 */
	if (cs->cs_havecache && (unsigned long)ch < CS_CACHE_SIZE)
		return (cs->cs_cache[ch]);
	return (cset_in_hard(cs, ch));
}
#endif /* CSET_H */

View File

@ -35,20 +35,21 @@
*/
#include <limits.h>
#define NCHARS (UCHAR_MAX + 1) /* Number of possible characters. */
#define OOBCH (UCHAR_MAX + 1) /* Out of band character value. */
#define NCHARS_SB (UCHAR_MAX + 1) /* Number of single-byte characters. */
#define OOBCH -1 /* Out of band character value. */
typedef struct {
enum { STRING1, STRING2 } which;
enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE,
SET, SET_UPPER, SET_LOWER } state;
int cnt; /* character count */
int lastch; /* last character */
int equiv[NCHARS]; /* equivalence set */
int *set; /* set of characters */
char *str; /* user's string */
CCLASS, CCLASS_UPPER, CCLASS_LOWER, SET } state;
int cnt; /* character count */
wint_t lastch; /* last character */
wctype_t cclass; /* character class from wctype() */
wint_t equiv[NCHARS_SB]; /* equivalence set */
wint_t *set; /* set of characters */
char *str; /* user's string */
} STR;
int next(STR *);
wint_t next(STR *);
int charcoll(const void *, const void *);

View File

@ -44,26 +44,31 @@ static const char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95";
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>
#include "extern.h"
static int backslash(STR *, int *);
static int bracket(STR *);
static int c_class(const void *, const void *);
static void genclass(STR *);
static void genequiv(STR *);
static int genrange(STR *, int);
static void genseq(STR *);
int
wint_t
next(s)
STR *s;
{
int ch, is_octal;
int is_octal;
wint_t ch;
wchar_t wch;
size_t clen;
switch (s->state) {
case EOS:
@ -71,7 +76,7 @@ next(s)
case INFINITE:
return (1);
case NORMAL:
switch (ch = (u_char)*s->str) {
switch (*s->str) {
case '\0':
s->state = EOS;
return (0);
@ -83,9 +88,13 @@ next(s)
return (next(s));
/* FALLTHROUGH */
default:
clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL);
if (clen == (size_t)-1 || clen == (size_t)-2 ||
clen == 0)
errc(1, EILSEQ, NULL);
is_octal = 0;
++s->str;
s->lastch = ch;
s->lastch = wch;
s->str += clen;
break;
}
@ -106,9 +115,18 @@ next(s)
return (next(s));
}
return (1);
case CCLASS:
case CCLASS_UPPER:
case CCLASS_LOWER:
s->cnt++;
ch = nextwctype(s->lastch, s->cclass);
if (ch == -1) {
s->state = NORMAL;
return (next(s));
}
s->lastch = ch;
return (1);
case SET:
case SET_UPPER:
case SET_LOWER:
if ((ch = s->set[s->cnt++]) == OOBCH) {
s->state = NORMAL;
return (next(s));
@ -159,74 +177,21 @@ bracket(s)
/* NOTREACHED */
}
typedef struct {
const char *name;
int (*func)(int);
int *set;
} CLASS;
static CLASS classes[] = {
#undef isalnum
{ "alnum", isalnum, NULL },
#undef isalpha
{ "alpha", isalpha, NULL },
#undef isblank
{ "blank", isblank, NULL },
#undef iscntrl
{ "cntrl", iscntrl, NULL },
#undef isdigit
{ "digit", isdigit, NULL },
#undef isgraph
{ "graph", isgraph, NULL },
#undef islower
{ "lower", islower, NULL },
#undef isprint
{ "print", isprint, NULL },
#undef ispunct
{ "punct", ispunct, NULL },
#undef isspace
{ "space", isspace, NULL },
#undef isupper
{ "upper", isupper, NULL },
#undef isxdigit
{ "xdigit", isxdigit, NULL },
};
static void
genclass(s)
STR *s;
{
int cnt, (*func)(int);
CLASS *cp, tmp;
int *p;
tmp.name = s->str;
if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) /
sizeof(CLASS), sizeof(CLASS), c_class)) == NULL)
if ((s->cclass = wctype(s->str)) == 0)
errx(1, "unknown class %s", s->str);
if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL)
err(1, "genclass() malloc");
for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt)
if ((func)(cnt))
*p++ = cnt;
*p = OOBCH;
s->cnt = 0;
s->set = cp->set;
s->lastch = -1; /* incremented before check in next() */
if (strcmp(s->str, "upper") == 0)
s->state = SET_UPPER;
s->state = CCLASS_UPPER;
else if (strcmp(s->str, "lower") == 0)
s->state = SET_LOWER;
s->state = CCLASS_LOWER;
else
s->state = SET;
}
static int
c_class(a, b)
const void *a, *b;
{
return (strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name));
s->state = CCLASS;
}
static void
@ -235,6 +200,8 @@ genequiv(s)
{
int i, p, pri;
char src[2], dst[3];
size_t clen;
wchar_t wc;
if (*s->str == '\\') {
s->equiv[0] = backslash(s, NULL);
@ -242,10 +209,13 @@ genequiv(s)
errx(1, "misplaced equivalence equals sign");
s->str += 2;
} else {
s->equiv[0] = s->str[0];
if (s->str[1] != '=')
clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0)
errc(1, EILSEQ, NULL);
s->equiv[0] = wc;
if (s->str[clen] != '=')
errx(1, "misplaced equivalence equals sign");
s->str += 3;
s->str += clen + 2;
}
/*
@ -255,12 +225,13 @@ genequiv(s)
* XXX Knows too much about how strxfrm() is implemented. Assumes
* it fills the string with primary collation weight bytes. Only one-
* to-one mappings are supported.
* XXX Equivalence classes not supported in multibyte locales.
*/
src[0] = s->equiv[0];
src[0] = (char)s->equiv[0];
src[1] = '\0';
if (strxfrm(dst, src, sizeof(dst)) == 1) {
if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) {
pri = (unsigned char)*dst;
for (p = 1, i = 1; i < NCHARS; i++) {
for (p = 1, i = 1; i < NCHARS_SB; i++) {
*src = i;
if (strxfrm(dst, src, sizeof(dst)) == 1 && pri &&
pri == (unsigned char)*dst)
@ -280,28 +251,41 @@ genrange(STR *s, int was_octal)
int stopval, octal;
char *savestart;
int n, cnt, *p;
size_t clen;
wchar_t wc;
octal = 0;
savestart = s->str;
stopval = *++s->str == '\\' ? backslash(s, &octal) : (u_char)*s->str++;
if (!octal)
octal = was_octal;
if ((octal && stopval < s->lastch) ||
(!octal &&
charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0)) {
s->str = savestart;
return (0);
if (*++s->str == '\\')
stopval = backslash(s, &octal);
else {
clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
if (clen == (size_t)-1 || clen == (size_t)-2)
errc(1, EILSEQ, NULL);
stopval = wc;
s->str += clen;
}
if (octal) {
/*
* XXX Characters are not ordered according to collating sequence in
* multibyte locales.
*/
if (octal || was_octal || MB_CUR_MAX > 1) {
if (stopval < s->lastch) {
s->str = savestart;
return (0);
}
s->cnt = stopval - s->lastch + 1;
s->state = RANGE;
--s->lastch;
return (1);
}
if ((s->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL)
if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
s->str = savestart;
return (0);
}
if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
err(1, "genrange() malloc");
for (cnt = 0; cnt < NCHARS; cnt++)
for (cnt = 0; cnt < NCHARS_SB; cnt++)
if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
*p++ = cnt;
@ -320,14 +304,21 @@ genseq(s)
STR *s;
{
char *ep;
wchar_t wc;
size_t clen;
if (s->which == STRING1)
errx(1, "sequences only valid in string2");
if (*s->str == '\\')
s->lastch = backslash(s, NULL);
else
s->lastch = *s->str++;
else {
clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
if (clen == (size_t)-1 || clen == (size_t)-2)
errc(1, EILSEQ, NULL);
s->lastch = wc;
s->str += clen;
}
if (*s->str != '*')
errx(1, "misplaced sequence asterisk");

View File

@ -49,67 +49,34 @@ static const char sccsid[] = "@(#)tr.c 8.2 (Berkeley) 5/4/95";
#include <ctype.h>
#include <err.h>
#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <wchar.h>
#include <wctype.h>
#include "cmap.h"
#include "cset.h"
#include "extern.h"
/*
* For -C option: determine whether a byte is a valid character in the
* current character set (as defined by LC_CTYPE).
*/
#define ISCHAR(c) (iscntrl(c) || isprint(c))
STR s1 = { STRING1, NORMAL, 0, OOBCH, 0, { 0, OOBCH }, NULL, NULL };
STR s2 = { STRING2, NORMAL, 0, OOBCH, 0, { 0, OOBCH }, NULL, NULL };
static int string1[NCHARS] = {
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* ASCII */
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
}, string2[NCHARS];
STR s1 = { STRING1, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
STR s2 = { STRING2, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
static void setup(int *, char *, STR *, int, int);
static struct cset *setup(char *, STR *, int, int);
static void usage(void);
int
main(int argc, char **argv)
{
static int carray[NCHARS];
int ch, cnt, n, lastch, *p;
static int carray[NCHARS_SB];
struct cmap *map;
struct cset *delete, *squeeze;
int n, *p;
int Cflag, cflag, dflag, sflag, isstring2;
wint_t ch, cnt, i, lastch;
(void)setlocale(LC_ALL, "");
@ -162,13 +129,14 @@ main(int argc, char **argv)
if (!isstring2)
usage();
setup(string1, argv[0], &s1, cflag, Cflag);
setup(string2, argv[1], &s2, 0, 0);
delete = setup(argv[0], &s1, cflag, Cflag);
squeeze = setup(argv[1], &s2, 0, 0);
for (lastch = OOBCH; (ch = getchar()) != EOF;)
if (!string1[ch] && (!string2[ch] || lastch != ch)) {
for (lastch = OOBCH; (ch = getwchar()) != WEOF;)
if (!cset_in(delete, ch) &&
(lastch != ch || !cset_in(squeeze, ch))) {
lastch = ch;
(void)putchar(ch);
(void)putwchar(ch);
}
exit(0);
}
@ -181,11 +149,11 @@ main(int argc, char **argv)
if (isstring2)
usage();
setup(string1, argv[0], &s1, cflag, Cflag);
delete = setup(argv[0], &s1, cflag, Cflag);
while ((ch = getchar()) != EOF)
if (!string1[ch])
(void)putchar(ch);
while ((ch = getwchar()) != WEOF)
if (!cset_in(delete, ch))
(void)putwchar(ch);
exit(0);
}
@ -194,12 +162,12 @@ main(int argc, char **argv)
* Squeeze all characters (or complemented characters) in string1.
*/
if (sflag && !isstring2) {
setup(string1, argv[0], &s1, cflag, Cflag);
squeeze = setup(argv[0], &s1, cflag, Cflag);
for (lastch = OOBCH; (ch = getchar()) != EOF;)
if (!string1[ch] || lastch != ch) {
for (lastch = OOBCH; (ch = getwchar()) != WEOF;)
if (lastch != ch || !cset_in(squeeze, ch)) {
lastch = ch;
(void)putchar(ch);
(void)putwchar(ch);
}
exit(0);
}
@ -213,13 +181,19 @@ main(int argc, char **argv)
if (!isstring2)
usage();
map = cmap_alloc();
if (map == NULL)
err(1, NULL);
squeeze = cset_alloc();
if (squeeze == NULL)
err(1, NULL);
s1.str = argv[0];
if (cflag || Cflag) {
if (Cflag || cflag) {
cmap_default(map, OOBCH);
if ((s2.str = strdup(argv[1])) == NULL)
errx(1, "strdup(argv[1])");
for (cnt = NCHARS, p = string1; cnt--;)
*p++ = OOBCH;
} else
s2.str = argv[1];
@ -235,52 +209,83 @@ main(int argc, char **argv)
/* If string2 runs out of characters, use the last one specified. */
while (next(&s1)) {
again:
if (s1.state == SET_LOWER &&
s2.state == SET_UPPER &&
if (s1.state == CCLASS_LOWER &&
s2.state == CCLASS_UPPER &&
s1.cnt == 1 && s2.cnt == 1) {
do {
string1[s1.lastch] = ch = toupper(s1.lastch);
if (sflag && isupper(ch))
string2[ch] = 1;
ch = towupper(s1.lastch);
cmap_add(map, s1.lastch, ch);
if (sflag && iswupper(ch))
cset_add(squeeze, ch);
if (!next(&s1))
goto endloop;
} while (s1.state == SET_LOWER && s1.cnt > 1);
} while (s1.state == CCLASS_LOWER && s1.cnt > 1);
/* skip upper set */
do {
if (!next(&s2))
break;
} while (s2.state == SET_UPPER && s2.cnt > 1);
} while (s2.state == CCLASS_UPPER && s2.cnt > 1);
goto again;
} else if (s1.state == SET_UPPER &&
s2.state == SET_LOWER &&
} else if (s1.state == CCLASS_UPPER &&
s2.state == CCLASS_LOWER &&
s1.cnt == 1 && s2.cnt == 1) {
do {
string1[s1.lastch] = ch = tolower(s1.lastch);
if (sflag && islower(ch))
string2[ch] = 1;
ch = towlower(s1.lastch);
cmap_add(map, s1.lastch, ch);
if (sflag && iswlower(ch))
cset_add(squeeze, ch);
if (!next(&s1))
goto endloop;
} while (s1.state == SET_UPPER && s1.cnt > 1);
} while (s1.state == CCLASS_UPPER && s1.cnt > 1);
/* skip lower set */
do {
if (!next(&s2))
break;
} while (s2.state == SET_LOWER && s2.cnt > 1);
} while (s2.state == CCLASS_LOWER && s2.cnt > 1);
goto again;
} else {
string1[s1.lastch] = s2.lastch;
cmap_add(map, s1.lastch, s2.lastch);
if (sflag)
string2[s2.lastch] = 1;
cset_add(squeeze, s2.lastch);
}
(void)next(&s2);
}
endloop:
if (cflag || Cflag) {
for (p = carray, cnt = 0; cnt < NCHARS; cnt++) {
if (string1[cnt] == OOBCH && (!Cflag || ISCHAR(cnt)))
if (cflag || (Cflag && MB_CUR_MAX > 1)) {
/*
* This is somewhat tricky: since the character set is
* potentially huge, we need to avoid allocating a map
* entry for every character. Our strategy is to set the
* default mapping to the last character of string #2
* (= the one that gets automatically repeated), then to
* add back identity mappings for characters that should
* remain unchanged. We don't waste space on identity mappings
* for non-characters with the -C option; those are simulated
* in the I/O loop.
*/
s2.str = argv[1];
s2.state = NORMAL;
for (cnt = 0; cnt < WCHAR_MAX; cnt++) {
if (Cflag && !iswrune(cnt))
continue;
if (cmap_lookup(map, cnt) == OOBCH) {
if (next(&s2))
cmap_add(map, cnt, s2.lastch);
if (sflag)
cset_add(squeeze, s2.lastch);
} else
cmap_add(map, cnt, cnt);
if ((s2.state == EOS || s2.state == INFINITE) &&
cnt >= cmap_max(map))
break;
}
cmap_default(map, s2.lastch);
} else if (Cflag) {
for (p = carray, cnt = 0; cnt < NCHARS_SB; cnt++) {
if (cmap_lookup(map, cnt) == OOBCH && iswrune(cnt))
*p++ = cnt;
else
string1[cnt] = cnt;
cmap_add(map, cnt, cnt);
}
n = p - carray;
if (Cflag && n > 1)
@ -290,46 +295,55 @@ main(int argc, char **argv)
s2.state = NORMAL;
for (cnt = 0; cnt < n; cnt++) {
(void)next(&s2);
string1[carray[cnt]] = s2.lastch;
cmap_add(map, carray[cnt], s2.lastch);
/*
* Chars taken from s2 can be different this time
* due to lack of complex upper/lower processing,
* so fill string2 again to not miss some.
*/
if (sflag)
string2[s2.lastch] = 1;
cset_add(squeeze, s2.lastch);
}
}
cset_cache(squeeze);
cmap_cache(map);
if (sflag)
for (lastch = OOBCH; (ch = getchar()) != EOF;) {
ch = string1[ch];
if (!string2[ch] || lastch != ch) {
for (lastch = OOBCH; (ch = getwchar()) != WEOF;) {
if (!Cflag || iswrune(ch))
ch = cmap_lookup(map, ch);
if (lastch != ch || !cset_in(squeeze, ch)) {
lastch = ch;
(void)putchar(ch);
(void)putwchar(ch);
}
}
else
while ((ch = getchar()) != EOF)
(void)putchar(string1[ch]);
while ((ch = getwchar()) != WEOF) {
if (!Cflag || iswrune(ch))
ch = cmap_lookup(map, ch);
(void)putwchar(ch);
}
exit (0);
}
static void
setup(int *string, char *arg, STR *str, int cflag, int Cflag)
static struct cset *
setup(char *arg, STR *str, int cflag, int Cflag)
{
int cnt, *p;
struct cset *cs;
cs = cset_alloc();
if (cs == NULL)
err(1, NULL);
str->str = arg;
bzero(string, NCHARS * sizeof(int));
while (next(str))
string[str->lastch] = 1;
if (cflag)
for (p = string, cnt = NCHARS; cnt--; ++p)
*p = !*p;
else if (Cflag)
for (cnt = 0; cnt < NCHARS; cnt++)
string[cnt] = !string[cnt] && ISCHAR(cnt);
cset_add(cs, str->lastch);
if (Cflag)
cset_addclass(cs, wctype("rune"), true);
if (cflag || Cflag)
cset_invert(cs);
cset_cache(cs);
return (cs);
}
int