Add support for multibyte characters. The challenge here was to use
data structures that scale better with large character sets, instead of arrays indexed by character value:
- Sets of characters to delete/squeeze are stored in a new "cset" structure, which is implemented as a splay tree of extents. This structure has the ability to store character classes (à la wctype(3)), but this is not currently fully utilized.
- Mappings between characters are stored in a new "cmap" structure, which is also a splay tree.
- The parser no longer builds arrays containing all the characters in a particular class; instead, next() determines them on-the-fly using nextwctype(3).
This commit is contained in:
parent
6ec70e64c6
commit
ca99cfdd14
@ -1,6 +1,7 @@
|
||||
# @(#)Makefile 8.1 (Berkeley) 6/6/93
|
||||
# $FreeBSD$
|
||||
|
||||
PROG= tr
|
||||
SRCS= str.c tr.c
|
||||
SRCS= cmap.c cset.c str.c tr.c
|
||||
|
||||
.include <bsd.prog.mk>
|
||||
|
212
usr.bin/tr/cmap.c
Normal file
212
usr.bin/tr/cmap.c
Normal file
@ -0,0 +1,212 @@
|
||||
/*-
|
||||
* Copyright (c) 2004 Tim J. Robbins.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
* "Character map" ADT. Stores mappings between pairs of characters in a
|
||||
* splay tree, with a lookup table cache to simplify looking up the first
|
||||
* bunch of characters (which are presumably more common than others).
|
||||
*/
|
||||
|
||||
#include <sys/cdefs.h>
|
||||
__FBSDID("$FreeBSD$");
|
||||
|
||||
#include <assert.h>
|
||||
#include <limits.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdlib.h>
|
||||
#include <wchar.h>
|
||||
#include "cmap.h"
|
||||
|
||||
static struct cmapnode *cmap_splay(struct cmapnode *, wint_t);
|
||||
|
||||
/*
|
||||
* cmap_alloc --
|
||||
* Allocate a character map.
|
||||
*/
|
||||
struct cmap *
|
||||
cmap_alloc(void)
|
||||
{
|
||||
struct cmap *cm;
|
||||
|
||||
cm = malloc(sizeof(*cm));
|
||||
if (cm == NULL)
|
||||
return (NULL);
|
||||
cm->cm_root = NULL;
|
||||
cm->cm_def = CM_DEF_SELF;
|
||||
cm->cm_havecache = false;
|
||||
cm->cm_min = cm->cm_max = 0;
|
||||
return (cm);
|
||||
}
|
||||
|
||||
/*
|
||||
* cmap_add --
|
||||
* Add a mapping from "from" to "to" to the map.
|
||||
*/
|
||||
bool
|
||||
cmap_add(struct cmap *cm, wint_t from, wint_t to)
|
||||
{
|
||||
struct cmapnode *cmn, *ncmn;
|
||||
|
||||
cm->cm_havecache = false;
|
||||
|
||||
if (cm->cm_root == NULL) {
|
||||
cmn = malloc(sizeof(*cmn));
|
||||
if (cmn == NULL)
|
||||
return (false);
|
||||
cmn->cmn_from = from;
|
||||
cmn->cmn_to = to;
|
||||
cmn->cmn_left = cmn->cmn_right = NULL;
|
||||
cm->cm_root = cmn;
|
||||
cm->cm_min = cm->cm_max = from;
|
||||
return (true);
|
||||
}
|
||||
|
||||
cmn = cm->cm_root = cmap_splay(cm->cm_root, from);
|
||||
|
||||
if (cmn->cmn_from == from) {
|
||||
cmn->cmn_to = to;
|
||||
return (true);
|
||||
}
|
||||
|
||||
ncmn = malloc(sizeof(*ncmn));
|
||||
if (ncmn == NULL)
|
||||
return (false);
|
||||
ncmn->cmn_from = from;
|
||||
ncmn->cmn_to = to;
|
||||
if (from < cmn->cmn_from) {
|
||||
ncmn->cmn_left = cmn->cmn_left;
|
||||
ncmn->cmn_right = cmn;
|
||||
cmn->cmn_left = NULL;
|
||||
} else {
|
||||
ncmn->cmn_right = cmn->cmn_right;
|
||||
ncmn->cmn_left = cmn;
|
||||
cmn->cmn_right = NULL;
|
||||
}
|
||||
if (from < cm->cm_min)
|
||||
cm->cm_min = from;
|
||||
if (from > cm->cm_max)
|
||||
cm->cm_max = from;
|
||||
cm->cm_root = ncmn;
|
||||
|
||||
return (true);
|
||||
}
|
||||
|
||||
/*
|
||||
* cmap_lookup_hard --
|
||||
* Look up the mapping for a character using the cache.
|
||||
*/
|
||||
wint_t
|
||||
cmap_lookup_hard(struct cmap *cm, wint_t ch)
|
||||
{
|
||||
|
||||
if (cm->cm_root != NULL) {
|
||||
cm->cm_root = cmap_splay(cm->cm_root, ch);
|
||||
if (cm->cm_root->cmn_from == ch)
|
||||
return (cm->cm_root->cmn_to);
|
||||
}
|
||||
return (cm->cm_def == CM_DEF_SELF ? ch : cm->cm_def);
|
||||
}
|
||||
|
||||
/*
|
||||
* cmap_cache --
|
||||
* Update the cache.
|
||||
*/
|
||||
void
|
||||
cmap_cache(struct cmap *cm)
|
||||
{
|
||||
wint_t ch;
|
||||
|
||||
for (ch = 0; ch < CM_CACHE_SIZE; ch++)
|
||||
cm->cm_cache[ch] = cmap_lookup_hard(cm, ch);
|
||||
|
||||
cm->cm_havecache = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* cmap_default --
|
||||
* Change the value that characters without mappings map to, and
|
||||
* return the old value. The special character value CM_MAP_SELF
|
||||
* means characters map to themselves.
|
||||
*/
|
||||
wint_t
|
||||
cmap_default(struct cmap *cm, wint_t def)
|
||||
{
|
||||
wint_t old;
|
||||
|
||||
old = cm->cm_def;
|
||||
cm->cm_def = def;
|
||||
cm->cm_havecache = false;
|
||||
return (old);
|
||||
}
|
||||
|
||||
static struct cmapnode *
|
||||
cmap_splay(struct cmapnode *t, wint_t ch)
|
||||
{
|
||||
struct cmapnode N, *l, *r, *y;
|
||||
|
||||
/*
|
||||
* Based on public domain code from Sleator.
|
||||
*/
|
||||
|
||||
assert(t != NULL);
|
||||
|
||||
N.cmn_left = N.cmn_right = NULL;
|
||||
l = r = &N;
|
||||
for (;;) {
|
||||
if (ch < t->cmn_from) {
|
||||
if (t->cmn_left != NULL &&
|
||||
ch < t->cmn_left->cmn_from) {
|
||||
y = t->cmn_left;
|
||||
t->cmn_left = y->cmn_right;
|
||||
y->cmn_right = t;
|
||||
t = y;
|
||||
}
|
||||
if (t->cmn_left == NULL)
|
||||
break;
|
||||
r->cmn_left = t;
|
||||
r = t;
|
||||
t = t->cmn_left;
|
||||
} else if (ch > t->cmn_from) {
|
||||
if (t->cmn_right != NULL &&
|
||||
ch > t->cmn_right->cmn_from) {
|
||||
y = t->cmn_right;
|
||||
t->cmn_right = y->cmn_left;
|
||||
y->cmn_left = t;
|
||||
t = y;
|
||||
}
|
||||
if (t->cmn_right == NULL)
|
||||
break;
|
||||
l->cmn_right = t;
|
||||
l = t;
|
||||
t = t->cmn_right;
|
||||
} else
|
||||
break;
|
||||
}
|
||||
l->cmn_right = t->cmn_left;
|
||||
r->cmn_left = t->cmn_right;
|
||||
t->cmn_left = N.cmn_right;
|
||||
t->cmn_right = N.cmn_left;
|
||||
return (t);
|
||||
}
|
83
usr.bin/tr/cmap.h
Normal file
83
usr.bin/tr/cmap.h
Normal file
@ -0,0 +1,83 @@
|
||||
/*-
|
||||
* Copyright (c) 2004 Tim J. Robbins.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* $FreeBSD$
|
||||
*/
|
||||
|
||||
#ifndef CMAP_H
|
||||
#define CMAP_H
|
||||
|
||||
#include <limits.h>
|
||||
#include <stdbool.h>
|
||||
#include <wchar.h>
|
||||
|
||||
/* One "from -> to" mapping; a node of the character map's splay tree. */
struct cmapnode {
|
||||
wint_t cmn_from; /* source character; the key the tree is ordered by */
|
||||
wint_t cmn_to; /* character that cmn_from maps to */
|
||||
struct cmapnode *cmn_left; /* subtree of smaller keys, or NULL */
|
||||
struct cmapnode *cmn_right; /* subtree of larger keys, or NULL */
|
||||
};
|
||||
|
||||
/*
 * A character map: a splay tree of explicit mappings plus a lookup
 * table for the first CM_CACHE_SIZE character values.
 */
struct cmap {
#define	CM_CACHE_SIZE	128
	wint_t		cm_cache[CM_CACHE_SIZE]; /* valid iff cm_havecache */
	bool		cm_havecache;	/* cleared whenever the map changes */
	struct cmapnode	*cm_root;	/* root of the tree, NULL if empty */
	/* Parenthesized so the negative literal is safe in any expression. */
#define	CM_DEF_SELF	(-2)
	wint_t		cm_def;		/* result for unmapped characters;
					   CM_DEF_SELF means "itself" */
	wint_t		cm_min;		/* smallest "from" value added */
	wint_t		cm_max;		/* largest "from" value added */
};
|
||||
|
||||
struct cmap * cmap_alloc(void);
|
||||
bool cmap_add(struct cmap *, wint_t, wint_t);
|
||||
wint_t cmap_lookup_hard(struct cmap *, wint_t);
|
||||
void cmap_cache(struct cmap *);
|
||||
wint_t cmap_default(struct cmap *, wint_t);
|
||||
|
||||
static __inline wint_t
|
||||
cmap_lookup(struct cmap *cm, wint_t from)
|
||||
{
|
||||
|
||||
if (from < CM_CACHE_SIZE && cm->cm_havecache)
|
||||
return (cm->cm_cache[from]);
|
||||
return (cmap_lookup_hard(cm, from));
|
||||
}
|
||||
|
||||
static __inline wint_t
|
||||
cmap_min(struct cmap *cm)
|
||||
{
|
||||
|
||||
return (cm->cm_min);
|
||||
}
|
||||
|
||||
static __inline wint_t
|
||||
cmap_max(struct cmap *cm)
|
||||
{
|
||||
|
||||
return (cm->cm_max);
|
||||
}
|
||||
|
||||
#endif
|
303
usr.bin/tr/cset.c
Normal file
303
usr.bin/tr/cset.c
Normal file
@ -0,0 +1,303 @@
|
||||
/*-
|
||||
* Copyright (c) 2004 Tim J. Robbins.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*/
|
||||
/*
|
||||
* "Set of characters" ADT implemented as a splay tree of extents, with
|
||||
* a lookup table cache to simplify looking up the first bunch of
|
||||
* characters (which are presumably more common than others).
|
||||
*/
|
||||
|
||||
#include <sys/cdefs.h>
|
||||
__FBSDID("$FreeBSD$");
|
||||
|
||||
#include <assert.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdlib.h>
|
||||
#include <wchar.h>
|
||||
#include <wctype.h>
|
||||
#include "cset.h"
|
||||
|
||||
static struct csnode * cset_delete(struct csnode *, wchar_t);
|
||||
static __inline int cset_rangecmp(struct csnode *, wchar_t);
|
||||
static struct csnode * cset_splay(struct csnode *, wchar_t);
|
||||
|
||||
/*
|
||||
* cset_alloc --
|
||||
* Allocate a set of characters.
|
||||
*/
|
||||
struct cset *
|
||||
cset_alloc(void)
|
||||
{
|
||||
struct cset *cs;
|
||||
|
||||
if ((cs = malloc(sizeof(*cs))) == NULL)
|
||||
return (NULL);
|
||||
cs->cs_root = NULL;
|
||||
cs->cs_classes = NULL;
|
||||
cs->cs_havecache = false;
|
||||
return (cs);
|
||||
}
|
||||
|
||||
/*
|
||||
* cset_add --
|
||||
* Add a character to the set.
|
||||
*/
|
||||
bool
|
||||
cset_add(struct cset *cs, wchar_t ch)
|
||||
{
|
||||
struct csnode *csn, *ncsn;
|
||||
wchar_t oval;
|
||||
|
||||
cs->cs_havecache = false;
|
||||
|
||||
/*
|
||||
* Inserting into empty tree; new item becomes the root.
|
||||
*/
|
||||
if (cs->cs_root == NULL) {
|
||||
csn = malloc(sizeof(*cs->cs_root));
|
||||
if (csn == NULL)
|
||||
return (false);
|
||||
csn->csn_left = csn->csn_right = NULL;
|
||||
csn->csn_min = csn->csn_max = ch;
|
||||
cs->cs_root = csn;
|
||||
return (true);
|
||||
}
|
||||
|
||||
/*
|
||||
* Splay to check whether the item already exists, and otherwise,
|
||||
* where we should put it.
|
||||
*/
|
||||
csn = cs->cs_root = cset_splay(cs->cs_root, ch);
|
||||
|
||||
/*
|
||||
* Easy cases where we can avoid allocating a new node:
|
||||
* (a) node already exists.
|
||||
* (b) we can lower the extent's "min" to accomodate this
|
||||
* character without having to coalesce.
|
||||
* (c) we can raise the extent's "max" without having
|
||||
* to coalesce.
|
||||
*/
|
||||
if (cset_rangecmp(csn, ch) == 0)
|
||||
return (true);
|
||||
if (ch + 1 == csn->csn_min && (csn->csn_left == NULL ||
|
||||
ch > csn->csn_left->csn_max + 1)) {
|
||||
csn->csn_min--;
|
||||
return (true);
|
||||
}
|
||||
if (ch == csn->csn_max + 1 && (csn->csn_right == NULL ||
|
||||
ch + 1 < csn->csn_right->csn_min)) {
|
||||
csn->csn_max++;
|
||||
return (true);
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate a new node and link it into the tree as a direct
|
||||
* child of the root.
|
||||
*/
|
||||
ncsn = malloc(sizeof(*ncsn));
|
||||
if (ncsn == NULL)
|
||||
return (false);
|
||||
ncsn->csn_min = ncsn->csn_max = ch;
|
||||
if (cset_rangecmp(csn, ch) < 0) {
|
||||
ncsn->csn_left = csn->csn_left;
|
||||
ncsn->csn_right = csn;
|
||||
csn->csn_left = NULL;
|
||||
} else {
|
||||
ncsn->csn_right = csn->csn_right;
|
||||
ncsn->csn_left = csn;
|
||||
csn->csn_right = NULL;
|
||||
}
|
||||
cs->cs_root = ncsn;
|
||||
|
||||
/*
|
||||
* Splay to bring the newly inserted node to the root, then
|
||||
* coalesce with left and right neighbours if possible.
|
||||
*/
|
||||
csn = cs->cs_root = cset_splay(cs->cs_root, ch);
|
||||
if (csn->csn_left != NULL &&
|
||||
csn->csn_left->csn_max + 1 == csn->csn_min) {
|
||||
oval = csn->csn_left->csn_min;
|
||||
cs->cs_root = cset_delete(cs->cs_root,
|
||||
csn->csn_left->csn_min);
|
||||
ncsn->csn_min = oval;
|
||||
}
|
||||
csn = cs->cs_root = cset_splay(cs->cs_root, ch);
|
||||
if (csn->csn_right != NULL &&
|
||||
csn->csn_right->csn_min - 1 == csn->csn_max) {
|
||||
oval = csn->csn_right->csn_max;
|
||||
cs->cs_root = cset_delete(cs->cs_root,
|
||||
csn->csn_right->csn_min);
|
||||
ncsn->csn_max = oval;
|
||||
}
|
||||
|
||||
return (true);
|
||||
}
|
||||
|
||||
/*
|
||||
* cset_in_hard --
|
||||
* Determine whether a character is in the set without using
|
||||
* the cache.
|
||||
*/
|
||||
bool
|
||||
cset_in_hard(struct cset *cs, wchar_t ch)
|
||||
{
|
||||
struct csclass *csc;
|
||||
|
||||
for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
|
||||
if (csc->csc_invert ^ iswctype(ch, csc->csc_type) != 0)
|
||||
return (cs->cs_invert ^ true);
|
||||
if (cs->cs_root != NULL) {
|
||||
cs->cs_root = cset_splay(cs->cs_root, ch);
|
||||
return (cs->cs_invert ^ cset_rangecmp(cs->cs_root, ch) == 0);
|
||||
}
|
||||
return (cs->cs_invert ^ false);
|
||||
}
|
||||
|
||||
/*
|
||||
* cset_cache --
|
||||
* Update the cache.
|
||||
*/
|
||||
void
|
||||
cset_cache(struct cset *cs)
|
||||
{
|
||||
wchar_t i;
|
||||
|
||||
for (i = 0; i < CS_CACHE_SIZE; i++)
|
||||
cs->cs_cache[i] = cset_in_hard(cs, i);
|
||||
|
||||
cs->cs_havecache = true;
|
||||
}
|
||||
|
||||
/*
|
||||
* cset_invert --
|
||||
* Invert the character set.
|
||||
*/
|
||||
void
|
||||
cset_invert(struct cset *cs)
|
||||
{
|
||||
|
||||
cs->cs_invert ^= true;
|
||||
cs->cs_havecache = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* cset_addclass --
|
||||
* Add a wctype()-style character class to the set, optionally
|
||||
* inverting it.
|
||||
*/
|
||||
bool
|
||||
cset_addclass(struct cset *cs, wctype_t type, bool invert)
|
||||
{
|
||||
struct csclass *csc;
|
||||
|
||||
csc = malloc(sizeof(*csc));
|
||||
if (csc == NULL)
|
||||
return (false);
|
||||
csc->csc_type = type;
|
||||
csc->csc_invert = invert;
|
||||
csc->csc_next = cs->cs_classes;
|
||||
cs->cs_classes = csc;
|
||||
cs->cs_havecache = false;
|
||||
return (true);
|
||||
}
|
||||
|
||||
static __inline int
|
||||
cset_rangecmp(struct csnode *t, wchar_t ch)
|
||||
{
|
||||
|
||||
if (ch < t->csn_min)
|
||||
return (-1);
|
||||
if (ch > t->csn_max)
|
||||
return (1);
|
||||
return (0);
|
||||
}
|
||||
|
||||
static struct csnode *
|
||||
cset_splay(struct csnode *t, wchar_t ch)
|
||||
{
|
||||
struct csnode N, *l, *r, *y;
|
||||
|
||||
/*
|
||||
* Based on public domain code from Sleator.
|
||||
*/
|
||||
|
||||
assert(t != NULL);
|
||||
|
||||
N.csn_left = N.csn_right = NULL;
|
||||
l = r = &N;
|
||||
for (;;) {
|
||||
if (cset_rangecmp(t, ch) < 0) {
|
||||
if (t->csn_left != NULL &&
|
||||
cset_rangecmp(t->csn_left, ch) < 0) {
|
||||
y = t->csn_left;
|
||||
t->csn_left = y->csn_right;
|
||||
y->csn_right = t;
|
||||
t = y;
|
||||
}
|
||||
if (t->csn_left == NULL)
|
||||
break;
|
||||
r->csn_left = t;
|
||||
r = t;
|
||||
t = t->csn_left;
|
||||
} else if (cset_rangecmp(t, ch) > 0) {
|
||||
if (t->csn_right != NULL &&
|
||||
cset_rangecmp(t->csn_right, ch) > 0) {
|
||||
y = t->csn_right;
|
||||
t->csn_right = y->csn_left;
|
||||
y->csn_left = t;
|
||||
t = y;
|
||||
}
|
||||
if (t->csn_right == NULL)
|
||||
break;
|
||||
l->csn_right = t;
|
||||
l = t;
|
||||
t = t->csn_right;
|
||||
} else
|
||||
break;
|
||||
}
|
||||
l->csn_right = t->csn_left;
|
||||
r->csn_left = t->csn_right;
|
||||
t->csn_left = N.csn_right;
|
||||
t->csn_right = N.csn_left;
|
||||
return (t);
|
||||
}
|
||||
|
||||
static struct csnode *
|
||||
cset_delete(struct csnode *t, wchar_t ch)
|
||||
{
|
||||
struct csnode *x;
|
||||
|
||||
assert(t != NULL);
|
||||
t = cset_splay(t, ch);
|
||||
assert(cset_rangecmp(t, ch) == 0);
|
||||
if (t->csn_left == NULL)
|
||||
x = t->csn_right;
|
||||
else {
|
||||
x = cset_splay(t->csn_left, ch);
|
||||
x->csn_right = t->csn_right;
|
||||
}
|
||||
free(t);
|
||||
return x;
|
||||
}
|
75
usr.bin/tr/cset.h
Normal file
75
usr.bin/tr/cset.h
Normal file
@ -0,0 +1,75 @@
|
||||
/*-
|
||||
* Copyright (c) 2004 Tim J. Robbins.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* $FreeBSD$
|
||||
*/
|
||||
|
||||
#ifndef CSET_H
|
||||
#define CSET_H
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <wchar.h>
|
||||
#include <wctype.h>
|
||||
|
||||
/* One extent (inclusive range of characters) in the set's splay tree. */
struct csnode {
|
||||
wchar_t csn_min; /* lowest character in the extent */
|
||||
wchar_t csn_max; /* highest character in the extent */
|
||||
struct csnode *csn_left; /* subtree of lower extents, or NULL */
|
||||
struct csnode *csn_right; /* subtree of higher extents, or NULL */
|
||||
};
|
||||
|
||||
/* One character class attached to a set; entries form a linked list. */
struct csclass {
|
||||
wctype_t csc_type; /* class handle, tested with iswctype() */
|
||||
bool csc_invert; /* true: match characters NOT in the class */
|
||||
bool csc_value; /* NOTE(review): not referenced by the cset code shown here */
|
||||
struct csclass *csc_next; /* next class in the list, or NULL */
|
||||
};
|
||||
|
||||
/*
 * A set of characters: a splay tree of extents, a list of character
 * classes, and a membership table for the first CS_CACHE_SIZE values.
 */
struct cset {
|
||||
#define CS_CACHE_SIZE 256
|
||||
bool cs_cache[CS_CACHE_SIZE]; /* cached membership; valid iff cs_havecache */
|
||||
bool cs_havecache; /* cleared whenever the set changes */
|
||||
struct csclass *cs_classes; /* head of the character class list */
|
||||
struct csnode *cs_root; /* root of the extent tree, NULL if empty */
|
||||
bool cs_invert; /* true: membership results are inverted */
|
||||
};
|
||||
|
||||
bool cset_addclass(struct cset *, wctype_t, bool);
|
||||
struct cset * cset_alloc(void);
|
||||
bool cset_add(struct cset *, wchar_t);
|
||||
void cset_invert(struct cset *);
|
||||
bool cset_in_hard(struct cset *, wchar_t);
|
||||
void cset_cache(struct cset *);
|
||||
|
||||
static __inline bool
|
||||
cset_in(struct cset *cs, wchar_t ch)
|
||||
{
|
||||
|
||||
if (ch < CS_CACHE_SIZE && cs->cs_havecache)
|
||||
return (cs->cs_cache[ch]);
|
||||
return (cset_in_hard(cs, ch));
|
||||
}
|
||||
|
||||
#endif /* CSET_H */
|
@ -35,20 +35,21 @@
|
||||
*/
|
||||
|
||||
#include <limits.h>
|
||||
#define NCHARS (UCHAR_MAX + 1) /* Number of possible characters. */
|
||||
#define OOBCH (UCHAR_MAX + 1) /* Out of band character value. */
|
||||
|
||||
#define NCHARS_SB (UCHAR_MAX + 1) /* Number of single-byte characters. */
|
||||
#define OOBCH -1 /* Out of band character value. */
|
||||
|
||||
typedef struct {
|
||||
enum { STRING1, STRING2 } which;
|
||||
enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE,
|
||||
SET, SET_UPPER, SET_LOWER } state;
|
||||
int cnt; /* character count */
|
||||
int lastch; /* last character */
|
||||
int equiv[NCHARS]; /* equivalence set */
|
||||
int *set; /* set of characters */
|
||||
char *str; /* user's string */
|
||||
CCLASS, CCLASS_UPPER, CCLASS_LOWER, SET } state;
|
||||
int cnt; /* character count */
|
||||
wint_t lastch; /* last character */
|
||||
wctype_t cclass; /* character class from wctype() */
|
||||
wint_t equiv[NCHARS_SB]; /* equivalence set */
|
||||
wint_t *set; /* set of characters */
|
||||
char *str; /* user's string */
|
||||
} STR;
|
||||
|
||||
int next(STR *);
|
||||
wint_t next(STR *);
|
||||
int charcoll(const void *, const void *);
|
||||
|
||||
|
163
usr.bin/tr/str.c
163
usr.bin/tr/str.c
@ -44,26 +44,31 @@ static const char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95";
|
||||
|
||||
#include <ctype.h>
|
||||
#include <err.h>
|
||||
#include <errno.h>
|
||||
#include <stddef.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <wchar.h>
|
||||
#include <wctype.h>
|
||||
|
||||
#include "extern.h"
|
||||
|
||||
static int backslash(STR *, int *);
|
||||
static int bracket(STR *);
|
||||
static int c_class(const void *, const void *);
|
||||
static void genclass(STR *);
|
||||
static void genequiv(STR *);
|
||||
static int genrange(STR *, int);
|
||||
static void genseq(STR *);
|
||||
|
||||
int
|
||||
wint_t
|
||||
next(s)
|
||||
STR *s;
|
||||
{
|
||||
int ch, is_octal;
|
||||
int is_octal;
|
||||
wint_t ch;
|
||||
wchar_t wch;
|
||||
size_t clen;
|
||||
|
||||
switch (s->state) {
|
||||
case EOS:
|
||||
@ -71,7 +76,7 @@ next(s)
|
||||
case INFINITE:
|
||||
return (1);
|
||||
case NORMAL:
|
||||
switch (ch = (u_char)*s->str) {
|
||||
switch (*s->str) {
|
||||
case '\0':
|
||||
s->state = EOS;
|
||||
return (0);
|
||||
@ -83,9 +88,13 @@ next(s)
|
||||
return (next(s));
|
||||
/* FALLTHROUGH */
|
||||
default:
|
||||
clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL);
|
||||
if (clen == (size_t)-1 || clen == (size_t)-2 ||
|
||||
clen == 0)
|
||||
errc(1, EILSEQ, NULL);
|
||||
is_octal = 0;
|
||||
++s->str;
|
||||
s->lastch = ch;
|
||||
s->lastch = wch;
|
||||
s->str += clen;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -106,9 +115,18 @@ next(s)
|
||||
return (next(s));
|
||||
}
|
||||
return (1);
|
||||
case CCLASS:
|
||||
case CCLASS_UPPER:
|
||||
case CCLASS_LOWER:
|
||||
s->cnt++;
|
||||
ch = nextwctype(s->lastch, s->cclass);
|
||||
if (ch == -1) {
|
||||
s->state = NORMAL;
|
||||
return (next(s));
|
||||
}
|
||||
s->lastch = ch;
|
||||
return (1);
|
||||
case SET:
|
||||
case SET_UPPER:
|
||||
case SET_LOWER:
|
||||
if ((ch = s->set[s->cnt++]) == OOBCH) {
|
||||
s->state = NORMAL;
|
||||
return (next(s));
|
||||
@ -159,74 +177,21 @@ bracket(s)
|
||||
/* NOTREACHED */
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
const char *name;
|
||||
int (*func)(int);
|
||||
int *set;
|
||||
} CLASS;
|
||||
|
||||
static CLASS classes[] = {
|
||||
#undef isalnum
|
||||
{ "alnum", isalnum, NULL },
|
||||
#undef isalpha
|
||||
{ "alpha", isalpha, NULL },
|
||||
#undef isblank
|
||||
{ "blank", isblank, NULL },
|
||||
#undef iscntrl
|
||||
{ "cntrl", iscntrl, NULL },
|
||||
#undef isdigit
|
||||
{ "digit", isdigit, NULL },
|
||||
#undef isgraph
|
||||
{ "graph", isgraph, NULL },
|
||||
#undef islower
|
||||
{ "lower", islower, NULL },
|
||||
#undef isprint
|
||||
{ "print", isprint, NULL },
|
||||
#undef ispunct
|
||||
{ "punct", ispunct, NULL },
|
||||
#undef isspace
|
||||
{ "space", isspace, NULL },
|
||||
#undef isupper
|
||||
{ "upper", isupper, NULL },
|
||||
#undef isxdigit
|
||||
{ "xdigit", isxdigit, NULL },
|
||||
};
|
||||
|
||||
static void
|
||||
genclass(s)
|
||||
STR *s;
|
||||
{
|
||||
int cnt, (*func)(int);
|
||||
CLASS *cp, tmp;
|
||||
int *p;
|
||||
|
||||
tmp.name = s->str;
|
||||
if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) /
|
||||
sizeof(CLASS), sizeof(CLASS), c_class)) == NULL)
|
||||
if ((s->cclass = wctype(s->str)) == 0)
|
||||
errx(1, "unknown class %s", s->str);
|
||||
|
||||
if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL)
|
||||
err(1, "genclass() malloc");
|
||||
for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt)
|
||||
if ((func)(cnt))
|
||||
*p++ = cnt;
|
||||
*p = OOBCH;
|
||||
|
||||
s->cnt = 0;
|
||||
s->set = cp->set;
|
||||
s->lastch = -1; /* incremented before check in next() */
|
||||
if (strcmp(s->str, "upper") == 0)
|
||||
s->state = SET_UPPER;
|
||||
s->state = CCLASS_UPPER;
|
||||
else if (strcmp(s->str, "lower") == 0)
|
||||
s->state = SET_LOWER;
|
||||
s->state = CCLASS_LOWER;
|
||||
else
|
||||
s->state = SET;
|
||||
}
|
||||
|
||||
static int
|
||||
c_class(a, b)
|
||||
const void *a, *b;
|
||||
{
|
||||
return (strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name));
|
||||
s->state = CCLASS;
|
||||
}
|
||||
|
||||
static void
|
||||
@ -235,6 +200,8 @@ genequiv(s)
|
||||
{
|
||||
int i, p, pri;
|
||||
char src[2], dst[3];
|
||||
size_t clen;
|
||||
wchar_t wc;
|
||||
|
||||
if (*s->str == '\\') {
|
||||
s->equiv[0] = backslash(s, NULL);
|
||||
@ -242,10 +209,13 @@ genequiv(s)
|
||||
errx(1, "misplaced equivalence equals sign");
|
||||
s->str += 2;
|
||||
} else {
|
||||
s->equiv[0] = s->str[0];
|
||||
if (s->str[1] != '=')
|
||||
clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
|
||||
if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0)
|
||||
errc(1, EILSEQ, NULL);
|
||||
s->equiv[0] = wc;
|
||||
if (s->str[clen] != '=')
|
||||
errx(1, "misplaced equivalence equals sign");
|
||||
s->str += 3;
|
||||
s->str += clen + 2;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -255,12 +225,13 @@ genequiv(s)
|
||||
* XXX Knows too much about how strxfrm() is implemented. Assumes
|
||||
* it fills the string with primary collation weight bytes. Only one-
|
||||
* to-one mappings are supported.
|
||||
* XXX Equivalence classes not supported in multibyte locales.
|
||||
*/
|
||||
src[0] = s->equiv[0];
|
||||
src[0] = (char)s->equiv[0];
|
||||
src[1] = '\0';
|
||||
if (strxfrm(dst, src, sizeof(dst)) == 1) {
|
||||
if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) {
|
||||
pri = (unsigned char)*dst;
|
||||
for (p = 1, i = 1; i < NCHARS; i++) {
|
||||
for (p = 1, i = 1; i < NCHARS_SB; i++) {
|
||||
*src = i;
|
||||
if (strxfrm(dst, src, sizeof(dst)) == 1 && pri &&
|
||||
pri == (unsigned char)*dst)
|
||||
@ -280,28 +251,41 @@ genrange(STR *s, int was_octal)
|
||||
int stopval, octal;
|
||||
char *savestart;
|
||||
int n, cnt, *p;
|
||||
size_t clen;
|
||||
wchar_t wc;
|
||||
|
||||
octal = 0;
|
||||
savestart = s->str;
|
||||
stopval = *++s->str == '\\' ? backslash(s, &octal) : (u_char)*s->str++;
|
||||
if (!octal)
|
||||
octal = was_octal;
|
||||
|
||||
if ((octal && stopval < s->lastch) ||
|
||||
(!octal &&
|
||||
charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0)) {
|
||||
s->str = savestart;
|
||||
return (0);
|
||||
if (*++s->str == '\\')
|
||||
stopval = backslash(s, &octal);
|
||||
else {
|
||||
clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
|
||||
if (clen == (size_t)-1 || clen == (size_t)-2)
|
||||
errc(1, EILSEQ, NULL);
|
||||
stopval = wc;
|
||||
s->str += clen;
|
||||
}
|
||||
if (octal) {
|
||||
/*
|
||||
* XXX Characters are not ordered according to collating sequence in
|
||||
* multibyte locales.
|
||||
*/
|
||||
if (octal || was_octal || MB_CUR_MAX > 1) {
|
||||
if (stopval < s->lastch) {
|
||||
s->str = savestart;
|
||||
return (0);
|
||||
}
|
||||
s->cnt = stopval - s->lastch + 1;
|
||||
s->state = RANGE;
|
||||
--s->lastch;
|
||||
return (1);
|
||||
}
|
||||
if ((s->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL)
|
||||
if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
|
||||
s->str = savestart;
|
||||
return (0);
|
||||
}
|
||||
if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
|
||||
err(1, "genrange() malloc");
|
||||
for (cnt = 0; cnt < NCHARS; cnt++)
|
||||
for (cnt = 0; cnt < NCHARS_SB; cnt++)
|
||||
if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
|
||||
charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
|
||||
*p++ = cnt;
|
||||
@ -320,14 +304,21 @@ genseq(s)
|
||||
STR *s;
|
||||
{
|
||||
char *ep;
|
||||
wchar_t wc;
|
||||
size_t clen;
|
||||
|
||||
if (s->which == STRING1)
|
||||
errx(1, "sequences only valid in string2");
|
||||
|
||||
if (*s->str == '\\')
|
||||
s->lastch = backslash(s, NULL);
|
||||
else
|
||||
s->lastch = *s->str++;
|
||||
else {
|
||||
clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
|
||||
if (clen == (size_t)-1 || clen == (size_t)-2)
|
||||
errc(1, EILSEQ, NULL);
|
||||
s->lastch = wc;
|
||||
s->str += clen;
|
||||
}
|
||||
if (*s->str != '*')
|
||||
errx(1, "misplaced sequence asterisk");
|
||||
|
||||
|
218
usr.bin/tr/tr.c
218
usr.bin/tr/tr.c
@ -49,67 +49,34 @@ static const char sccsid[] = "@(#)tr.c 8.2 (Berkeley) 5/4/95";
|
||||
|
||||
#include <ctype.h>
|
||||
#include <err.h>
|
||||
#include <limits.h>
|
||||
#include <locale.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <wchar.h>
|
||||
#include <wctype.h>
|
||||
|
||||
#include "cmap.h"
|
||||
#include "cset.h"
|
||||
#include "extern.h"
|
||||
|
||||
/*
|
||||
* For -C option: determine whether a byte is a valid character in the
|
||||
* current character set (as defined by LC_CTYPE).
|
||||
*/
|
||||
#define ISCHAR(c) (iscntrl(c) || isprint(c))
|
||||
STR s1 = { STRING1, NORMAL, 0, OOBCH, 0, { 0, OOBCH }, NULL, NULL };
|
||||
STR s2 = { STRING2, NORMAL, 0, OOBCH, 0, { 0, OOBCH }, NULL, NULL };
|
||||
|
||||
static int string1[NCHARS] = {
|
||||
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* ASCII */
|
||||
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
|
||||
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
|
||||
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
|
||||
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
|
||||
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
|
||||
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
|
||||
0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
|
||||
0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
|
||||
0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
|
||||
0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
|
||||
0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
|
||||
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
|
||||
0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
|
||||
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
|
||||
0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
|
||||
0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
|
||||
0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
|
||||
0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
|
||||
0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
|
||||
0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
|
||||
0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
|
||||
0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
|
||||
0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
|
||||
0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
|
||||
0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
|
||||
0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
|
||||
0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
|
||||
0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
|
||||
0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
|
||||
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
|
||||
0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
|
||||
}, string2[NCHARS];
|
||||
|
||||
STR s1 = { STRING1, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
|
||||
STR s2 = { STRING2, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
|
||||
|
||||
static void setup(int *, char *, STR *, int, int);
|
||||
static struct cset *setup(char *, STR *, int, int);
|
||||
static void usage(void);
|
||||
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
static int carray[NCHARS];
|
||||
int ch, cnt, n, lastch, *p;
|
||||
static int carray[NCHARS_SB];
|
||||
struct cmap *map;
|
||||
struct cset *delete, *squeeze;
|
||||
int n, *p;
|
||||
int Cflag, cflag, dflag, sflag, isstring2;
|
||||
wint_t ch, cnt, i, lastch;
|
||||
|
||||
(void)setlocale(LC_ALL, "");
|
||||
|
||||
@ -162,13 +129,14 @@ main(int argc, char **argv)
|
||||
if (!isstring2)
|
||||
usage();
|
||||
|
||||
setup(string1, argv[0], &s1, cflag, Cflag);
|
||||
setup(string2, argv[1], &s2, 0, 0);
|
||||
delete = setup(argv[0], &s1, cflag, Cflag);
|
||||
squeeze = setup(argv[1], &s2, 0, 0);
|
||||
|
||||
for (lastch = OOBCH; (ch = getchar()) != EOF;)
|
||||
if (!string1[ch] && (!string2[ch] || lastch != ch)) {
|
||||
for (lastch = OOBCH; (ch = getwchar()) != WEOF;)
|
||||
if (!cset_in(delete, ch) &&
|
||||
(lastch != ch || !cset_in(squeeze, ch))) {
|
||||
lastch = ch;
|
||||
(void)putchar(ch);
|
||||
(void)putwchar(ch);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
@ -181,11 +149,11 @@ main(int argc, char **argv)
|
||||
if (isstring2)
|
||||
usage();
|
||||
|
||||
setup(string1, argv[0], &s1, cflag, Cflag);
|
||||
delete = setup(argv[0], &s1, cflag, Cflag);
|
||||
|
||||
while ((ch = getchar()) != EOF)
|
||||
if (!string1[ch])
|
||||
(void)putchar(ch);
|
||||
while ((ch = getwchar()) != WEOF)
|
||||
if (!cset_in(delete, ch))
|
||||
(void)putwchar(ch);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
@ -194,12 +162,12 @@ main(int argc, char **argv)
|
||||
* Squeeze all characters (or complemented characters) in string1.
|
||||
*/
|
||||
if (sflag && !isstring2) {
|
||||
setup(string1, argv[0], &s1, cflag, Cflag);
|
||||
squeeze = setup(argv[0], &s1, cflag, Cflag);
|
||||
|
||||
for (lastch = OOBCH; (ch = getchar()) != EOF;)
|
||||
if (!string1[ch] || lastch != ch) {
|
||||
for (lastch = OOBCH; (ch = getwchar()) != WEOF;)
|
||||
if (lastch != ch || !cset_in(squeeze, ch)) {
|
||||
lastch = ch;
|
||||
(void)putchar(ch);
|
||||
(void)putwchar(ch);
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
@ -213,13 +181,19 @@ main(int argc, char **argv)
|
||||
if (!isstring2)
|
||||
usage();
|
||||
|
||||
map = cmap_alloc();
|
||||
if (map == NULL)
|
||||
err(1, NULL);
|
||||
squeeze = cset_alloc();
|
||||
if (squeeze == NULL)
|
||||
err(1, NULL);
|
||||
|
||||
s1.str = argv[0];
|
||||
if (cflag || Cflag) {
|
||||
|
||||
if (Cflag || cflag) {
|
||||
cmap_default(map, OOBCH);
|
||||
if ((s2.str = strdup(argv[1])) == NULL)
|
||||
errx(1, "strdup(argv[1])");
|
||||
|
||||
for (cnt = NCHARS, p = string1; cnt--;)
|
||||
*p++ = OOBCH;
|
||||
} else
|
||||
s2.str = argv[1];
|
||||
|
||||
@ -235,52 +209,83 @@ main(int argc, char **argv)
|
||||
/* If string2 runs out of characters, use the last one specified. */
|
||||
while (next(&s1)) {
|
||||
again:
|
||||
if (s1.state == SET_LOWER &&
|
||||
s2.state == SET_UPPER &&
|
||||
if (s1.state == CCLASS_LOWER &&
|
||||
s2.state == CCLASS_UPPER &&
|
||||
s1.cnt == 1 && s2.cnt == 1) {
|
||||
do {
|
||||
string1[s1.lastch] = ch = toupper(s1.lastch);
|
||||
if (sflag && isupper(ch))
|
||||
string2[ch] = 1;
|
||||
ch = towupper(s1.lastch);
|
||||
cmap_add(map, s1.lastch, ch);
|
||||
if (sflag && iswupper(ch))
|
||||
cset_add(squeeze, ch);
|
||||
if (!next(&s1))
|
||||
goto endloop;
|
||||
} while (s1.state == SET_LOWER && s1.cnt > 1);
|
||||
} while (s1.state == CCLASS_LOWER && s1.cnt > 1);
|
||||
/* skip upper set */
|
||||
do {
|
||||
if (!next(&s2))
|
||||
break;
|
||||
} while (s2.state == SET_UPPER && s2.cnt > 1);
|
||||
} while (s2.state == CCLASS_UPPER && s2.cnt > 1);
|
||||
goto again;
|
||||
} else if (s1.state == SET_UPPER &&
|
||||
s2.state == SET_LOWER &&
|
||||
} else if (s1.state == CCLASS_UPPER &&
|
||||
s2.state == CCLASS_LOWER &&
|
||||
s1.cnt == 1 && s2.cnt == 1) {
|
||||
do {
|
||||
string1[s1.lastch] = ch = tolower(s1.lastch);
|
||||
if (sflag && islower(ch))
|
||||
string2[ch] = 1;
|
||||
ch = towlower(s1.lastch);
|
||||
cmap_add(map, s1.lastch, ch);
|
||||
if (sflag && iswlower(ch))
|
||||
cset_add(squeeze, ch);
|
||||
if (!next(&s1))
|
||||
goto endloop;
|
||||
} while (s1.state == SET_UPPER && s1.cnt > 1);
|
||||
} while (s1.state == CCLASS_UPPER && s1.cnt > 1);
|
||||
/* skip lower set */
|
||||
do {
|
||||
if (!next(&s2))
|
||||
break;
|
||||
} while (s2.state == SET_LOWER && s2.cnt > 1);
|
||||
} while (s2.state == CCLASS_LOWER && s2.cnt > 1);
|
||||
goto again;
|
||||
} else {
|
||||
string1[s1.lastch] = s2.lastch;
|
||||
cmap_add(map, s1.lastch, s2.lastch);
|
||||
if (sflag)
|
||||
string2[s2.lastch] = 1;
|
||||
cset_add(squeeze, s2.lastch);
|
||||
}
|
||||
(void)next(&s2);
|
||||
}
|
||||
endloop:
|
||||
if (cflag || Cflag) {
|
||||
for (p = carray, cnt = 0; cnt < NCHARS; cnt++) {
|
||||
if (string1[cnt] == OOBCH && (!Cflag || ISCHAR(cnt)))
|
||||
if (cflag || (Cflag && MB_CUR_MAX > 1)) {
|
||||
/*
|
||||
* This is somewhat tricky: since the character set is
|
||||
* potentially huge, we need to avoid allocating a map
|
||||
* entry for every character. Our strategy is to set the
|
||||
* default mapping to the last character of string #2
|
||||
* (= the one that gets automatically repeated), then to
|
||||
* add back identity mappings for characters that should
|
||||
* remain unchanged. We don't waste space on identity mappings
|
||||
* for non-characters with the -C option; those are simulated
|
||||
* in the I/O loop.
|
||||
*/
|
||||
s2.str = argv[1];
|
||||
s2.state = NORMAL;
|
||||
for (cnt = 0; cnt < WCHAR_MAX; cnt++) {
|
||||
if (Cflag && !iswrune(cnt))
|
||||
continue;
|
||||
if (cmap_lookup(map, cnt) == OOBCH) {
|
||||
if (next(&s2))
|
||||
cmap_add(map, cnt, s2.lastch);
|
||||
if (sflag)
|
||||
cset_add(squeeze, s2.lastch);
|
||||
} else
|
||||
cmap_add(map, cnt, cnt);
|
||||
if ((s2.state == EOS || s2.state == INFINITE) &&
|
||||
cnt >= cmap_max(map))
|
||||
break;
|
||||
}
|
||||
cmap_default(map, s2.lastch);
|
||||
} else if (Cflag) {
|
||||
for (p = carray, cnt = 0; cnt < NCHARS_SB; cnt++) {
|
||||
if (cmap_lookup(map, cnt) == OOBCH && iswrune(cnt))
|
||||
*p++ = cnt;
|
||||
else
|
||||
string1[cnt] = cnt;
|
||||
cmap_add(map, cnt, cnt);
|
||||
}
|
||||
n = p - carray;
|
||||
if (Cflag && n > 1)
|
||||
@ -290,46 +295,55 @@ endloop:
|
||||
s2.state = NORMAL;
|
||||
for (cnt = 0; cnt < n; cnt++) {
|
||||
(void)next(&s2);
|
||||
string1[carray[cnt]] = s2.lastch;
|
||||
cmap_add(map, carray[cnt], s2.lastch);
|
||||
/*
|
||||
* Chars taken from s2 can be different this time
|
||||
* due to lack of complex upper/lower processing,
|
||||
* so fill string2 again to not miss some.
|
||||
*/
|
||||
if (sflag)
|
||||
string2[s2.lastch] = 1;
|
||||
cset_add(squeeze, s2.lastch);
|
||||
}
|
||||
}
|
||||
|
||||
cset_cache(squeeze);
|
||||
cmap_cache(map);
|
||||
|
||||
if (sflag)
|
||||
for (lastch = OOBCH; (ch = getchar()) != EOF;) {
|
||||
ch = string1[ch];
|
||||
if (!string2[ch] || lastch != ch) {
|
||||
for (lastch = OOBCH; (ch = getwchar()) != WEOF;) {
|
||||
if (!Cflag || iswrune(ch))
|
||||
ch = cmap_lookup(map, ch);
|
||||
if (lastch != ch || !cset_in(squeeze, ch)) {
|
||||
lastch = ch;
|
||||
(void)putchar(ch);
|
||||
(void)putwchar(ch);
|
||||
}
|
||||
}
|
||||
else
|
||||
while ((ch = getchar()) != EOF)
|
||||
(void)putchar(string1[ch]);
|
||||
while ((ch = getwchar()) != WEOF) {
|
||||
if (!Cflag || iswrune(ch))
|
||||
ch = cmap_lookup(map, ch);
|
||||
(void)putwchar(ch);
|
||||
}
|
||||
exit (0);
|
||||
}
|
||||
|
||||
static void
|
||||
setup(int *string, char *arg, STR *str, int cflag, int Cflag)
|
||||
static struct cset *
|
||||
setup(char *arg, STR *str, int cflag, int Cflag)
|
||||
{
|
||||
int cnt, *p;
|
||||
struct cset *cs;
|
||||
|
||||
cs = cset_alloc();
|
||||
if (cs == NULL)
|
||||
err(1, NULL);
|
||||
str->str = arg;
|
||||
bzero(string, NCHARS * sizeof(int));
|
||||
while (next(str))
|
||||
string[str->lastch] = 1;
|
||||
if (cflag)
|
||||
for (p = string, cnt = NCHARS; cnt--; ++p)
|
||||
*p = !*p;
|
||||
else if (Cflag)
|
||||
for (cnt = 0; cnt < NCHARS; cnt++)
|
||||
string[cnt] = !string[cnt] && ISCHAR(cnt);
|
||||
cset_add(cs, str->lastch);
|
||||
if (Cflag)
|
||||
cset_addclass(cs, wctype("rune"), true);
|
||||
if (cflag || Cflag)
|
||||
cset_invert(cs);
|
||||
cset_cache(cs);
|
||||
return (cs);
|
||||
}
|
||||
|
||||
int
|
||||
|
Loading…
x
Reference in New Issue
Block a user