From ca99cfdd14f4fa361788e3a15e1bfdd99e72b58c Mon Sep 17 00:00:00 2001
From: "Tim J. Robbins" <tjr@FreeBSD.org>
Date: Fri, 9 Jul 2004 02:08:07 +0000
Subject: [PATCH] Add support for multibyte characters. The challenge here was
 to use data structures that scale better with large character sets, instead
 of arrays indexed by character value: - Sets of characters to delete/squeeze
 are stored in a new "cset" structure, which is implemented as a splay tree of
 extents. This structure has the ability to store character classes (ala
 wctype(3)), but this is not currently fully utilized. - Mappings between
 characters are stored in a new "cmap" structure, which is also a splay tree.
 - The parser no longer builds arrays containing all the characters in a
 particular class; instead, next() determines them on-the-fly using
 nextwctype(3).

---
 usr.bin/tr/Makefile |   3 +-
 usr.bin/tr/cmap.c   | 212 +++++++++++++++++++++++++++++++
 usr.bin/tr/cmap.h   |  83 ++++++++++++
 usr.bin/tr/cset.c   | 303 ++++++++++++++++++++++++++++++++++++++++++++
 usr.bin/tr/cset.h   |  75 +++++++++++
 usr.bin/tr/extern.h |  21 +--
 usr.bin/tr/str.c    | 163 +++++++++++-------------
 usr.bin/tr/tr.c     | 218 ++++++++++++++++---------------
 8 files changed, 879 insertions(+), 199 deletions(-)
 create mode 100644 usr.bin/tr/cmap.c
 create mode 100644 usr.bin/tr/cmap.h
 create mode 100644 usr.bin/tr/cset.c
 create mode 100644 usr.bin/tr/cset.h

diff --git a/usr.bin/tr/Makefile b/usr.bin/tr/Makefile
index 7124942bdefc..00bdd8d5bb7a 100644
--- a/usr.bin/tr/Makefile
+++ b/usr.bin/tr/Makefile
@@ -1,6 +1,7 @@
 #	@(#)Makefile	8.1 (Berkeley) 6/6/93
+# $FreeBSD$
 
 PROG=	tr
-SRCS=	str.c tr.c
+SRCS=	cmap.c cset.c str.c tr.c
 
 .include <bsd.prog.mk>
diff --git a/usr.bin/tr/cmap.c b/usr.bin/tr/cmap.c
new file mode 100644
index 000000000000..a2cac3377d02
--- /dev/null
+++ b/usr.bin/tr/cmap.c
@@ -0,0 +1,212 @@
+/*-
+ * Copyright (c) 2004 Tim J. Robbins.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * "Character map" ADT. Stores mappings between pairs of characters in a
+ * splay tree, with a lookup table cache to simplify looking up the first
+ * bunch of characters (which are presumably more common than others).
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <assert.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <wchar.h>
+#include "cmap.h"
+
+static struct cmapnode *cmap_splay(struct cmapnode *, wint_t);
+
+/*
+ * cmap_alloc --
+ *	Allocate a character map.
+ */
+struct cmap *
+cmap_alloc(void)
+{
+	struct cmap *cm;
+
+	cm = malloc(sizeof(*cm));
+	if (cm == NULL)
+		return (NULL);
+	cm->cm_root = NULL;
+	cm->cm_def = CM_DEF_SELF;
+	cm->cm_havecache = false;
+	cm->cm_min = cm->cm_max = 0;
+	return (cm);
+}
+
+/*
+ * cmap_add --
+ *	Add a mapping from "from" to "to" to the map.
+ */
+bool
+cmap_add(struct cmap *cm, wint_t from, wint_t to)
+{
+	struct cmapnode *cmn, *ncmn;
+
+	cm->cm_havecache = false;
+
+	if (cm->cm_root == NULL) {
+		cmn = malloc(sizeof(*cmn));
+		if (cmn == NULL)
+			return (false);
+		cmn->cmn_from = from;
+		cmn->cmn_to = to;
+		cmn->cmn_left = cmn->cmn_right = NULL;
+		cm->cm_root = cmn;
+		cm->cm_min = cm->cm_max = from;
+		return (true);
+	}
+
+	cmn = cm->cm_root = cmap_splay(cm->cm_root, from);
+
+	if (cmn->cmn_from == from) {
+		cmn->cmn_to = to;
+		return (true);
+	}
+
+	ncmn = malloc(sizeof(*ncmn));
+	if (ncmn == NULL)
+		return (false);
+	ncmn->cmn_from = from;
+	ncmn->cmn_to = to;
+	if (from < cmn->cmn_from) {
+		ncmn->cmn_left = cmn->cmn_left;
+		ncmn->cmn_right = cmn;
+		cmn->cmn_left = NULL;
+	} else {
+		ncmn->cmn_right = cmn->cmn_right;
+		ncmn->cmn_left = cmn;
+		cmn->cmn_right = NULL;
+	}
+	if (from < cm->cm_min)
+		cm->cm_min = from;
+	if (from > cm->cm_max)
+		cm->cm_max = from;
+        cm->cm_root = ncmn;
+
+	return (true);
+}
+
+/*
+ * cmap_lookup_hard --
+ *	Look up the mapping for a character without using the cache.
+ */
+wint_t
+cmap_lookup_hard(struct cmap *cm, wint_t ch)
+{
+
+	if (cm->cm_root != NULL) {
+		cm->cm_root = cmap_splay(cm->cm_root, ch);
+		if (cm->cm_root->cmn_from == ch)
+			return (cm->cm_root->cmn_to);
+	}
+	return (cm->cm_def == CM_DEF_SELF ? ch : cm->cm_def);
+}
+
+/*
+ * cmap_cache --
+ *	Update the cache.
+ */
+void
+cmap_cache(struct cmap *cm)
+{
+	wint_t ch;
+
+	for (ch = 0; ch < CM_CACHE_SIZE; ch++)
+		cm->cm_cache[ch] = cmap_lookup_hard(cm, ch);
+
+	cm->cm_havecache = true;
+}
+
+/*
+ * cmap_default --
+ *	Change the value that characters without mappings map to, and
+ *	return the old value. The special character value CM_DEF_SELF
+ *	means characters map to themselves.
+ */
+wint_t
+cmap_default(struct cmap *cm, wint_t def)
+{
+	wint_t old;
+
+	old = cm->cm_def;
+	cm->cm_def = def;
+	cm->cm_havecache = false;
+	return (old);
+}
+
+static struct cmapnode *
+cmap_splay(struct cmapnode *t, wint_t ch)
+{
+	struct cmapnode N, *l, *r, *y;
+
+	/*
+	 * Based on public domain code from Sleator.
+	 */
+
+	assert(t != NULL);
+
+	N.cmn_left = N.cmn_right = NULL;
+	l = r = &N;
+	for (;;) {
+		if (ch < t->cmn_from) {
+			if (t->cmn_left != NULL &&
+			    ch < t->cmn_left->cmn_from) {
+				y = t->cmn_left;
+				t->cmn_left = y->cmn_right;
+				y->cmn_right = t;
+				t = y;
+			}
+			if (t->cmn_left == NULL)
+				break;
+			r->cmn_left = t;
+			r = t;
+			t = t->cmn_left;
+		} else if (ch > t->cmn_from) {
+			if (t->cmn_right != NULL &&
+			    ch > t->cmn_right->cmn_from) {
+				y = t->cmn_right;
+				t->cmn_right = y->cmn_left;
+				y->cmn_left = t;
+				t = y;
+			}
+			if (t->cmn_right == NULL)
+				break;
+			l->cmn_right = t;
+			l = t;
+			t = t->cmn_right;
+		} else
+			break;
+	}
+	l->cmn_right = t->cmn_left;
+	r->cmn_left = t->cmn_right;
+	t->cmn_left = N.cmn_right;
+	t->cmn_right = N.cmn_left;
+	return (t);
+}
diff --git a/usr.bin/tr/cmap.h b/usr.bin/tr/cmap.h
new file mode 100644
index 000000000000..9a81e134678f
--- /dev/null
+++ b/usr.bin/tr/cmap.h
@@ -0,0 +1,83 @@
+/*-
+ * Copyright (c) 2004 Tim J. Robbins.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef CMAP_H
+#define	CMAP_H
+
+#include <limits.h>
+#include <stdbool.h>
+#include <wchar.h>
+
+struct cmapnode {
+	wint_t		cmn_from;
+	wint_t		cmn_to;
+	struct cmapnode	*cmn_left;
+	struct cmapnode	*cmn_right;
+};
+
+struct cmap {
+#define	CM_CACHE_SIZE	128
+	wint_t		cm_cache[CM_CACHE_SIZE];
+	bool		cm_havecache;
+	struct cmapnode	*cm_root;
+#define	CM_DEF_SELF	-2
+	wint_t		cm_def;
+	wint_t		cm_min;
+	wint_t		cm_max;
+};
+
+struct cmap *	cmap_alloc(void);
+bool		cmap_add(struct cmap *, wint_t, wint_t);
+wint_t		cmap_lookup_hard(struct cmap *, wint_t);
+void		cmap_cache(struct cmap *);
+wint_t		cmap_default(struct cmap *, wint_t);
+
+static __inline wint_t
+cmap_lookup(struct cmap *cm, wint_t from)
+{
+
+	if (from < CM_CACHE_SIZE && cm->cm_havecache)
+		return (cm->cm_cache[from]);
+	return (cmap_lookup_hard(cm, from));
+}
+
+static __inline wint_t
+cmap_min(struct cmap *cm)
+{
+
+	return (cm->cm_min);
+}
+
+static __inline wint_t
+cmap_max(struct cmap *cm)
+{
+
+	return (cm->cm_max);
+}
+
+#endif
diff --git a/usr.bin/tr/cset.c b/usr.bin/tr/cset.c
new file mode 100644
index 000000000000..05dbd7753d8e
--- /dev/null
+++ b/usr.bin/tr/cset.c
@@ -0,0 +1,303 @@
+/*-
+ * Copyright (c) 2004 Tim J. Robbins.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * "Set of characters" ADT implemented as a splay tree of extents, with
+ * a lookup table cache to simplify looking up the first bunch of
+ * characters (which are presumably more common than others).
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <wchar.h>
+#include <wctype.h>
+#include "cset.h"
+
+static struct csnode *	cset_delete(struct csnode *, wchar_t);
+static __inline int	cset_rangecmp(struct csnode *, wchar_t);
+static struct csnode *	cset_splay(struct csnode *, wchar_t);
+
+/*
+ * cset_alloc --
+ *	Allocate an empty, non-inverted set; returns NULL if malloc() fails.
+ */
+struct cset *
+cset_alloc(void)
+{
+	struct cset *cs;
+
+	if ((cs = malloc(sizeof(*cs))) == NULL)
+		return (NULL);
+	cs->cs_root = NULL;
+	cs->cs_classes = NULL;
+	cs->cs_havecache = cs->cs_invert = false; /* malloc() does not zero */
+	return (cs);
+}
+
+/*
+ * cset_add --
+ *	Add a character to the set.
+ */
+bool
+cset_add(struct cset *cs, wchar_t ch)
+{
+	struct csnode *csn, *ncsn;
+	wchar_t oval;
+
+	cs->cs_havecache = false;
+
+	/*
+	 * Inserting into empty tree; new item becomes the root.
+	 */
+	if (cs->cs_root == NULL) {
+		csn = malloc(sizeof(*cs->cs_root));
+		if (csn == NULL)
+			return (false);
+		csn->csn_left = csn->csn_right = NULL;
+		csn->csn_min = csn->csn_max = ch;
+		cs->cs_root = csn;
+		return (true);
+	}
+
+	/*
+	 * Splay to check whether the item already exists, and otherwise,
+	 * where we should put it.
+	 */
+	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
+
+	/*
+	 * Easy cases where we can avoid allocating a new node:
+	 *	(a) node already exists.
+	 *	(b) we can lower the extent's "min" to accommodate this
+	 *	    character without having to coalesce.
+	 *	(c) we can raise the extent's "max" without having
+	 *	    to coalesce.
+	 */
+	if (cset_rangecmp(csn, ch) == 0)
+		return (true);
+	if (ch + 1 == csn->csn_min && (csn->csn_left == NULL ||
+	    ch > csn->csn_left->csn_max + 1)) {
+		csn->csn_min--;
+		return (true);
+	}
+	if (ch == csn->csn_max + 1 && (csn->csn_right == NULL ||
+	    ch + 1 < csn->csn_right->csn_min)) {
+		csn->csn_max++;
+		return (true);
+	}
+
+	/*
+	 * Allocate a new node and link it into the tree as a direct
+	 * child of the root.
+	 */
+	ncsn = malloc(sizeof(*ncsn));
+	if (ncsn == NULL)
+		return (false);
+	ncsn->csn_min = ncsn->csn_max = ch;
+	if (cset_rangecmp(csn, ch) < 0) {
+		ncsn->csn_left = csn->csn_left;
+		ncsn->csn_right = csn;
+		csn->csn_left = NULL;
+	} else {
+		ncsn->csn_right = csn->csn_right;
+		ncsn->csn_left = csn;
+		csn->csn_right = NULL;
+	}
+	cs->cs_root = ncsn;
+
+	/*
+	 * Splay to bring the newly inserted node to the root, then
+	 * coalesce with left and right neighbours if possible.
+	 */
+	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
+	if (csn->csn_left != NULL &&
+	    csn->csn_left->csn_max + 1 == csn->csn_min) {
+		oval = csn->csn_left->csn_min;
+		cs->cs_root = cset_delete(cs->cs_root,
+		    csn->csn_left->csn_min);
+		ncsn->csn_min = oval;
+	}
+	csn = cs->cs_root = cset_splay(cs->cs_root, ch);
+	if (csn->csn_right != NULL &&
+	    csn->csn_right->csn_min - 1 == csn->csn_max) {
+		oval = csn->csn_right->csn_max;
+		cs->cs_root = cset_delete(cs->cs_root,
+		    csn->csn_right->csn_min);
+		ncsn->csn_max = oval;
+	}
+
+	return (true);
+}
+
+/*
+ * cset_in_hard --
+ *	Test membership without the cache: classes are checked first, then
+ *	the extent tree (which is splayed); cs_invert flips the result.
+ */
+bool
+cset_in_hard(struct cset *cs, wchar_t ch)
+{
+	struct csclass *csc;
+
+	for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
+		if (csc->csc_invert ^ (iswctype(ch, csc->csc_type) != 0))
+			return (cs->cs_invert ^ true);
+	if (cs->cs_root != NULL) {
+		cs->cs_root = cset_splay(cs->cs_root, ch);
+		return (cs->cs_invert ^ (cset_rangecmp(cs->cs_root, ch) == 0));
+	}
+	return (cs->cs_invert ^ false);
+}
+
+/*
+ * cset_cache --
+ *	Update the cache.
+ */
+void
+cset_cache(struct cset *cs)
+{
+	wchar_t i;
+
+	for (i = 0; i < CS_CACHE_SIZE; i++)
+		cs->cs_cache[i] = cset_in_hard(cs, i);
+
+	cs->cs_havecache = true;
+}
+
+/*
+ * cset_invert --
+ *	Invert the character set.
+ */
+void
+cset_invert(struct cset *cs)
+{
+
+	cs->cs_invert ^= true;
+	cs->cs_havecache = false;
+}
+
+/*
+ * cset_addclass --
+ *	Add a wctype()-style character class to the set, optionally
+ *	inverting it.
+ */
+bool
+cset_addclass(struct cset *cs, wctype_t type, bool invert)
+{
+	struct csclass *csc;
+
+	csc = malloc(sizeof(*csc));
+	if (csc == NULL)
+		return (false);
+	csc->csc_type = type;
+	csc->csc_invert = invert;
+	csc->csc_next = cs->cs_classes;
+	cs->cs_classes = csc;
+	cs->cs_havecache = false;
+	return (true);
+}
+
+static __inline int
+cset_rangecmp(struct csnode *t, wchar_t ch)
+{
+
+	if (ch < t->csn_min)
+		return (-1);
+	if (ch > t->csn_max)
+		return (1);
+	return (0);
+}
+
+static struct csnode *
+cset_splay(struct csnode *t, wchar_t ch)
+{
+	struct csnode N, *l, *r, *y;
+
+	/*
+	 * Based on public domain code from Sleator.
+	 */
+
+	assert(t != NULL);
+
+	N.csn_left = N.csn_right = NULL;
+	l = r = &N;
+	for (;;) {
+		if (cset_rangecmp(t, ch) < 0) {
+			if (t->csn_left != NULL &&
+			    cset_rangecmp(t->csn_left, ch) < 0) {
+				y = t->csn_left;
+				t->csn_left = y->csn_right;
+				y->csn_right = t;
+				t = y;
+			}
+			if (t->csn_left == NULL)
+				break;
+			r->csn_left = t;
+			r = t;
+			t = t->csn_left;
+		} else if (cset_rangecmp(t, ch) > 0) {
+			if (t->csn_right != NULL &&
+			    cset_rangecmp(t->csn_right, ch) > 0) {
+				y = t->csn_right;
+				t->csn_right = y->csn_left;
+				y->csn_left = t;
+				t = y;
+			}
+			if (t->csn_right == NULL)
+				break;
+			l->csn_right = t;
+			l = t;
+			t = t->csn_right;
+		} else
+			break;
+	}
+	l->csn_right = t->csn_left;
+	r->csn_left = t->csn_right;
+	t->csn_left = N.csn_right;
+	t->csn_right = N.csn_left;
+	return (t);
+}
+
+static struct csnode *
+cset_delete(struct csnode *t, wchar_t ch)
+{
+	struct csnode *x;
+
+	assert(t != NULL);
+	t = cset_splay(t, ch);
+	assert(cset_rangecmp(t, ch) == 0);
+	if (t->csn_left == NULL)
+		x = t->csn_right;
+	else {
+		x = cset_splay(t->csn_left, ch);
+		x->csn_right = t->csn_right;
+	}
+	free(t);
+	return x;
+}
diff --git a/usr.bin/tr/cset.h b/usr.bin/tr/cset.h
new file mode 100644
index 000000000000..b85349315666
--- /dev/null
+++ b/usr.bin/tr/cset.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2004 Tim J. Robbins.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef CSET_H
+#define	CSET_H
+
+#include <stdbool.h>
+#include <wchar.h>
+#include <wctype.h>
+
+struct csnode {
+	wchar_t		csn_min;
+	wchar_t		csn_max;
+	struct csnode	*csn_left;
+	struct csnode	*csn_right;
+};
+
+struct csclass {
+	wctype_t	csc_type;
+	bool		csc_invert;
+	bool		csc_value;
+	struct csclass	*csc_next;
+};
+
+struct cset {
+#define	CS_CACHE_SIZE	256
+	bool		cs_cache[CS_CACHE_SIZE];
+	bool		cs_havecache;
+	struct csclass	*cs_classes;
+	struct csnode	*cs_root;
+	bool		cs_invert;
+};
+
+bool			cset_addclass(struct cset *, wctype_t, bool);
+struct cset *		cset_alloc(void);
+bool 			cset_add(struct cset *, wchar_t);
+void			cset_invert(struct cset *);
+bool			cset_in_hard(struct cset *, wchar_t);
+void			cset_cache(struct cset *);
+
+static __inline bool
+cset_in(struct cset *cs, wchar_t ch)
+{
+
+	if (ch < CS_CACHE_SIZE && cs->cs_havecache)
+		return (cs->cs_cache[ch]);
+	return (cset_in_hard(cs, ch));
+}
+
+#endif	/* CSET_H */
diff --git a/usr.bin/tr/extern.h b/usr.bin/tr/extern.h
index a5ed577c09a6..2fdbdf33438c 100644
--- a/usr.bin/tr/extern.h
+++ b/usr.bin/tr/extern.h
@@ -35,20 +35,21 @@
  */
 
 #include <limits.h>
-#define	NCHARS	(UCHAR_MAX + 1)		/* Number of possible characters. */
-#define	OOBCH	(UCHAR_MAX + 1)		/* Out of band character value. */
+
+#define	NCHARS_SB	(UCHAR_MAX + 1)	/* Number of single-byte characters. */
+#define	OOBCH		-1		/* Out of band character value. */
 
 typedef struct {
 	enum { STRING1, STRING2 } which;
 	enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE,
-	       SET, SET_UPPER, SET_LOWER } state;
-	int	 cnt;			/* character count */
-	int	 lastch;		/* last character */
-	int	equiv[NCHARS];		/* equivalence set */
-	int	*set;			/* set of characters */
-	char	*str;			/* user's string */
+	       CCLASS, CCLASS_UPPER, CCLASS_LOWER, SET } state;
+	int		cnt;		/* character count */
+	wint_t		lastch;		/* last character */
+	wctype_t	cclass;		/* character class from wctype() */
+	wint_t		equiv[NCHARS_SB];	/* equivalence set */
+	wint_t		*set;		/* set of characters */
+	char		*str;		/* user's string */
 } STR;
 
-int	 next(STR *);
+wint_t	 next(STR *);
 int charcoll(const void *, const void *);
-
diff --git a/usr.bin/tr/str.c b/usr.bin/tr/str.c
index 3365cafb946a..f28b243d2ec5 100644
--- a/usr.bin/tr/str.c
+++ b/usr.bin/tr/str.c
@@ -44,26 +44,31 @@ static const char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
 
 #include <ctype.h>
 #include <err.h>
+#include <errno.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <wchar.h>
+#include <wctype.h>
 
 #include "extern.h"
 
 static int      backslash(STR *, int *);
 static int	bracket(STR *);
-static int	c_class(const void *, const void *);
 static void	genclass(STR *);
 static void	genequiv(STR *);
 static int      genrange(STR *, int);
 static void	genseq(STR *);
 
-int
+wint_t
 next(s)
 	STR *s;
 {
-	int ch, is_octal;
+	int is_octal;
+	wint_t ch;
+	wchar_t wch;
+	size_t clen;
 
 	switch (s->state) {
 	case EOS:
@@ -71,7 +76,7 @@ next(s)
 	case INFINITE:
 		return (1);
 	case NORMAL:
-		switch (ch = (u_char)*s->str) {
+		switch (*s->str) {
 		case '\0':
 			s->state = EOS;
 			return (0);
@@ -83,9 +88,13 @@ next(s)
 				return (next(s));
 			/* FALLTHROUGH */
 		default:
+			clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL);
+			if (clen == (size_t)-1 || clen == (size_t)-2 ||
+			    clen == 0)
+				errc(1, EILSEQ, NULL);
 			is_octal = 0;
-			++s->str;
-			s->lastch = ch;
+			s->lastch = wch;
+			s->str += clen;
 			break;
 		}
 
@@ -106,9 +115,18 @@ next(s)
 			return (next(s));
 		}
 		return (1);
+	case CCLASS:
+	case CCLASS_UPPER:
+	case CCLASS_LOWER:
+		s->cnt++;
+		ch = nextwctype(s->lastch, s->cclass);
+		if (ch == -1) {
+			s->state = NORMAL;
+			return (next(s));
+		}
+		s->lastch = ch;
+		return (1);
 	case SET:
-	case SET_UPPER:
-	case SET_LOWER:
 		if ((ch = s->set[s->cnt++]) == OOBCH) {
 			s->state = NORMAL;
 			return (next(s));
@@ -159,74 +177,21 @@ bracket(s)
 	/* NOTREACHED */
 }
 
-typedef struct {
-	const char *name;
-	int (*func)(int);
-	int *set;
-} CLASS;
-
-static CLASS classes[] = {
-#undef isalnum
-	{ "alnum",  isalnum,  NULL },
-#undef isalpha
-	{ "alpha",  isalpha,  NULL },
-#undef isblank
-	{ "blank",  isblank,  NULL },
-#undef iscntrl
-	{ "cntrl",  iscntrl,  NULL },
-#undef isdigit
-	{ "digit",  isdigit,  NULL },
-#undef isgraph
-	{ "graph",  isgraph,  NULL },
-#undef islower
-	{ "lower",  islower,  NULL },
-#undef isprint
-	{ "print",  isprint,  NULL },
-#undef ispunct
-	{ "punct",  ispunct,  NULL },
-#undef isspace
-	{ "space",  isspace,  NULL },
-#undef isupper
-	{ "upper",  isupper,  NULL },
-#undef isxdigit
-	{ "xdigit", isxdigit, NULL },
-};
-
 static void
 genclass(s)
 	STR *s;
 {
-	int cnt, (*func)(int);
-	CLASS *cp, tmp;
-	int *p;
 
-	tmp.name = s->str;
-	if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) /
-	    sizeof(CLASS), sizeof(CLASS), c_class)) == NULL)
+	if ((s->cclass = wctype(s->str)) == 0)
 		errx(1, "unknown class %s", s->str);
-
-	if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL)
-		err(1, "genclass() malloc");
-	for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt)
-		if ((func)(cnt))
-			*p++ = cnt;
-	*p = OOBCH;
-
 	s->cnt = 0;
-	s->set = cp->set;
+	s->lastch = -1;		/* incremented before check in next() */
 	if (strcmp(s->str, "upper") == 0)
-		s->state = SET_UPPER;
+		s->state = CCLASS_UPPER;
 	else if (strcmp(s->str, "lower") == 0)
-		s->state = SET_LOWER;
+		s->state = CCLASS_LOWER;
 	else
-		s->state = SET;
-}
-
-static int
-c_class(a, b)
-	const void *a, *b;
-{
-	return (strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name));
+		s->state = CCLASS;
 }
 
 static void
@@ -235,6 +200,8 @@ genequiv(s)
 {
 	int i, p, pri;
 	char src[2], dst[3];
+	size_t clen;
+	wchar_t wc;
 
 	if (*s->str == '\\') {
 		s->equiv[0] = backslash(s, NULL);
@@ -242,10 +209,13 @@ genequiv(s)
 			errx(1, "misplaced equivalence equals sign");
 		s->str += 2;
 	} else {
-		s->equiv[0] = s->str[0];
-		if (s->str[1] != '=')
+		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
+		if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0)
+			errc(1, EILSEQ, NULL);
+		s->equiv[0] = wc;
+		if (s->str[clen] != '=')
 			errx(1, "misplaced equivalence equals sign");
-		s->str += 3;
+		s->str += clen + 2;
 	}
 
 	/*
@@ -255,12 +225,13 @@ genequiv(s)
 	 * XXX Knows too much about how strxfrm() is implemented. Assumes
 	 * it fills the string with primary collation weight bytes. Only one-
 	 * to-one mappings are supported.
+	 * XXX Equivalence classes not supported in multibyte locales.
 	 */
-	src[0] = s->equiv[0];
+	src[0] = (char)s->equiv[0];
 	src[1] = '\0';
-	if (strxfrm(dst, src, sizeof(dst)) == 1) {
+	if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) {
 		pri = (unsigned char)*dst;
-		for (p = 1, i = 1; i < NCHARS; i++) {
+		for (p = 1, i = 1; i < NCHARS_SB; i++) {
 			*src = i;
 			if (strxfrm(dst, src, sizeof(dst)) == 1 && pri &&
 			    pri == (unsigned char)*dst)
@@ -280,28 +251,41 @@ genrange(STR *s, int was_octal)
 	int stopval, octal;
 	char *savestart;
 	int n, cnt, *p;
+	size_t clen;
+	wchar_t wc;
 
 	octal = 0;
 	savestart = s->str;
-	stopval = *++s->str == '\\' ? backslash(s, &octal) : (u_char)*s->str++;
-	if (!octal)
-		octal = was_octal;
-
-	if ((octal && stopval < s->lastch) ||
-	    (!octal &&
-	     charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0)) {
-		s->str = savestart;
-		return (0);
+	if (*++s->str == '\\')
+		stopval = backslash(s, &octal);
+	else {
+		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
+		if (clen == (size_t)-1 || clen == (size_t)-2)
+			errc(1, EILSEQ, NULL);
+		stopval = wc;
+		s->str += clen;
 	}
-	if (octal) {
+	/*
+	 * XXX Characters are not ordered according to collating sequence in
+	 * multibyte locales.
+	 */
+	if (octal || was_octal || MB_CUR_MAX > 1) {
+		if (stopval < s->lastch) {
+			s->str = savestart;
+			return (0);
+		}
 		s->cnt = stopval - s->lastch + 1;
 		s->state = RANGE;
 		--s->lastch;
 		return (1);
 	}
-	if ((s->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL)
+	if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
+		s->str = savestart;
+		return (0);
+	}
+	if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
 		err(1, "genrange() malloc");
-	for (cnt = 0; cnt < NCHARS; cnt++)
+	for (cnt = 0; cnt < NCHARS_SB; cnt++)
 		if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
 		    charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
 			*p++ = cnt;
@@ -320,14 +304,21 @@ genseq(s)
 	STR *s;
 {
 	char *ep;
+	wchar_t wc;
+	size_t clen;
 
 	if (s->which == STRING1)
 		errx(1, "sequences only valid in string2");
 
 	if (*s->str == '\\')
 		s->lastch = backslash(s, NULL);
-	else
-		s->lastch = *s->str++;
+	else {
+		clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
+		if (clen == (size_t)-1 || clen == (size_t)-2)
+			errc(1, EILSEQ, NULL);
+		s->lastch = wc;
+		s->str += clen;
+	}
 	if (*s->str != '*')
 		errx(1, "misplaced sequence asterisk");
 
diff --git a/usr.bin/tr/tr.c b/usr.bin/tr/tr.c
index a22ba136dc8e..f3ce7ee56e25 100644
--- a/usr.bin/tr/tr.c
+++ b/usr.bin/tr/tr.c
@@ -49,67 +49,34 @@ static const char sccsid[] = "@(#)tr.c	8.2 (Berkeley) 5/4/95";
 
 #include <ctype.h>
 #include <err.h>
+#include <limits.h>
 #include <locale.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <wchar.h>
+#include <wctype.h>
 
+#include "cmap.h"
+#include "cset.h"
 #include "extern.h"
 
-/*
- * For -C option: determine whether a byte is a valid character in the
- * current character set (as defined by LC_CTYPE).
- */
-#define ISCHAR(c) (iscntrl(c) || isprint(c))
+STR s1 = { STRING1, NORMAL, 0, OOBCH, 0, { 0, OOBCH }, NULL, NULL };
+STR s2 = { STRING2, NORMAL, 0, OOBCH, 0, { 0, OOBCH }, NULL, NULL };
 
-static int string1[NCHARS] = {
-	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,		/* ASCII */
-	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
-	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
-	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
-	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
-	0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
-	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
-	0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
-	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
-	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
-	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
-	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
-	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
-	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
-	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
-	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
-	0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
-	0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
-	0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
-	0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
-	0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
-	0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
-	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
-	0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
-	0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
-	0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
-	0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
-	0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
-	0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
-	0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
-	0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
-	0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
-}, string2[NCHARS];
-
-STR s1 = { STRING1, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
-STR s2 = { STRING2, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
-
-static void setup(int *, char *, STR *, int, int);
+static struct cset *setup(char *, STR *, int, int);
 static void usage(void);
 
 int
 main(int argc, char **argv)
 {
-	static int carray[NCHARS];
-	int ch, cnt, n, lastch, *p;
+	static int carray[NCHARS_SB];
+	struct cmap *map;
+	struct cset *delete, *squeeze;
+	int n, *p;
 	int Cflag, cflag, dflag, sflag, isstring2;
+	wint_t ch, cnt, i, lastch;
 
 	(void)setlocale(LC_ALL, "");
 
@@ -162,13 +129,14 @@ main(int argc, char **argv)
 		if (!isstring2)
 			usage();
 
-		setup(string1, argv[0], &s1, cflag, Cflag);
-		setup(string2, argv[1], &s2, 0, 0);
+		delete = setup(argv[0], &s1, cflag, Cflag);
+		squeeze = setup(argv[1], &s2, 0, 0);
 
-		for (lastch = OOBCH; (ch = getchar()) != EOF;)
-			if (!string1[ch] && (!string2[ch] || lastch != ch)) {
+		for (lastch = OOBCH; (ch = getwchar()) != WEOF;)
+			if (!cset_in(delete, ch) &&
+			    (lastch != ch || !cset_in(squeeze, ch))) {
 				lastch = ch;
-				(void)putchar(ch);
+				(void)putwchar(ch);
 			}
 		exit(0);
 	}
@@ -181,11 +149,11 @@ main(int argc, char **argv)
 		if (isstring2)
 			usage();
 
-		setup(string1, argv[0], &s1, cflag, Cflag);
+		delete = setup(argv[0], &s1, cflag, Cflag);
 
-		while ((ch = getchar()) != EOF)
-			if (!string1[ch])
-				(void)putchar(ch);
+		while ((ch = getwchar()) != WEOF)
+			if (!cset_in(delete, ch))
+				(void)putwchar(ch);
 		exit(0);
 	}
 
@@ -194,12 +162,12 @@ main(int argc, char **argv)
 	 * Squeeze all characters (or complemented characters) in string1.
 	 */
 	if (sflag && !isstring2) {
-		setup(string1, argv[0], &s1, cflag, Cflag);
+		squeeze = setup(argv[0], &s1, cflag, Cflag); /* from string1 */
 
-		for (lastch = OOBCH; (ch = getchar()) != EOF;)
-			if (!string1[ch] || lastch != ch) {
+		for (lastch = OOBCH; (ch = getwchar()) != WEOF;)
+			if (lastch != ch || !cset_in(squeeze, ch)) {
 				lastch = ch;
-				(void)putchar(ch);
+				(void)putwchar(ch);	/* not a squeezed repeat */
 			}
 		exit(0);
 	}
@@ -213,13 +181,19 @@ main(int argc, char **argv)
 	if (!isstring2)
 		usage();
 
+	map = cmap_alloc();		/* string1 -> string2 mapping */
+	if (map == NULL)
+		err(1, NULL);
+	squeeze = cset_alloc();		/* only added to when sflag is set */
+	if (squeeze == NULL)
+		err(1, NULL);
+
 	s1.str = argv[0];
-	if (cflag || Cflag) {
+
+	if (Cflag || cflag) {
+		cmap_default(map, OOBCH);	/* OOBCH marks "no mapping yet" */
 		if ((s2.str = strdup(argv[1])) == NULL)
 			errx(1, "strdup(argv[1])");
-
-		for (cnt = NCHARS, p = string1; cnt--;)
-			*p++ = OOBCH;
 	} else
 		s2.str = argv[1];
 
@@ -235,52 +209,83 @@ main(int argc, char **argv)
 	/* If string2 runs out of characters, use the last one specified. */
 	while (next(&s1)) {
 	again:
-		if (s1.state == SET_LOWER &&
-		    s2.state == SET_UPPER &&
+		if (s1.state == CCLASS_LOWER &&	/* lower class onto upper */
+		    s2.state == CCLASS_UPPER &&
 		    s1.cnt == 1 && s2.cnt == 1) {
 			do {
-				string1[s1.lastch] = ch = toupper(s1.lastch);
-				if (sflag && isupper(ch))
-					string2[ch] = 1;
+				ch = towupper(s1.lastch);
+				cmap_add(map, s1.lastch, ch);
+				if (sflag && iswupper(ch))
+					cset_add(squeeze, ch);
 				if (!next(&s1))
 					goto endloop;
-			} while (s1.state == SET_LOWER && s1.cnt > 1);
+			} while (s1.state == CCLASS_LOWER && s1.cnt > 1);
 			/* skip upper set */
 			do {
 				if (!next(&s2))
 					break;
-			} while (s2.state == SET_UPPER && s2.cnt > 1);
+			} while (s2.state == CCLASS_UPPER && s2.cnt > 1);
 			goto again;
-		} else if (s1.state == SET_UPPER &&
-			   s2.state == SET_LOWER &&
+		} else if (s1.state == CCLASS_UPPER &&	/* and the reverse */
+			   s2.state == CCLASS_LOWER &&
 			   s1.cnt == 1 && s2.cnt == 1) {
 			do {
-				string1[s1.lastch] = ch = tolower(s1.lastch);
-				if (sflag && islower(ch))
-					string2[ch] = 1;
+				ch = towlower(s1.lastch);
+				cmap_add(map, s1.lastch, ch);
+				if (sflag && iswlower(ch))
+					cset_add(squeeze, ch);
 				if (!next(&s1))
 					goto endloop;
-			} while (s1.state == SET_UPPER && s1.cnt > 1);
+			} while (s1.state == CCLASS_UPPER && s1.cnt > 1);
 			/* skip lower set */
 			do {
 				if (!next(&s2))
 					break;
-			} while (s2.state == SET_LOWER && s2.cnt > 1);
+			} while (s2.state == CCLASS_LOWER && s2.cnt > 1);
 			goto again;
 		} else {
-			string1[s1.lastch] = s2.lastch;
+			cmap_add(map, s1.lastch, s2.lastch);	/* plain pairing */
 			if (sflag)
-				string2[s2.lastch] = 1;
+				cset_add(squeeze, s2.lastch);
 		}
 		(void)next(&s2);
 	}
 endloop:
-	if (cflag || Cflag) {
-		for (p = carray, cnt = 0; cnt < NCHARS; cnt++) {
-			if (string1[cnt] == OOBCH && (!Cflag || ISCHAR(cnt)))
+	if (cflag || (Cflag && MB_CUR_MAX > 1)) {
+		/*
+		 * This is somewhat tricky: since the character set is
+		 * potentially huge, we need to avoid allocating a map
+		 * entry for every character. Our strategy is to set the
+		 * default mapping to the last character of string #2
+		 * (= the one that gets automatically repeated), then to
+		 * add back identity mappings for characters that should
+		 * remain unchanged. We don't waste space on identity mappings
+		 * for non-characters with the -C option; those are simulated
+		 * in the I/O loop.
+		 */
+		s2.str = argv[1];		/* re-read string2 */
+		s2.state = NORMAL;
+		for (cnt = 0; cnt < WCHAR_MAX; cnt++) {
+			if (Cflag && !iswrune(cnt))	/* -C skips non-runes */
+				continue;
+			if (cmap_lookup(map, cnt) == OOBCH) { /* unmapped */
+				if (next(&s2))
+					cmap_add(map, cnt, s2.lastch);
+				if (sflag)
+					cset_add(squeeze, s2.lastch);
+			} else
+				cmap_add(map, cnt, cnt);	/* identity */
+			if ((s2.state == EOS || s2.state == INFINITE) &&
+			    cnt >= cmap_max(map))
+				break;	/* the default mapping covers the rest */
+		}
+		cmap_default(map, s2.lastch);	/* last char of string2 */
+	} else if (Cflag) {			/* -C with MB_CUR_MAX == 1 */
+		for (p = carray, cnt = 0; cnt < NCHARS_SB; cnt++) {
+			if (cmap_lookup(map, cnt) == OOBCH && iswrune(cnt))
 				*p++ = cnt;
 			else
-				string1[cnt] = cnt;
+				cmap_add(map, cnt, cnt);	/* identity */
 		}
 		n = p - carray;
 		if (Cflag && n > 1)
@@ -290,46 +295,55 @@ main(int argc, char **argv)
 		s2.state = NORMAL;
 		for (cnt = 0; cnt < n; cnt++) {
 			(void)next(&s2);
-			string1[carray[cnt]] = s2.lastch;
+			cmap_add(map, carray[cnt], s2.lastch);
 			/*
 			 * Chars taken from s2 can be different this time
 			 * due to lack of complex upper/lower processing,
 			 * so fill string2 again to not miss some.
 			 */
 			if (sflag)
-				string2[s2.lastch] = 1;
+				cset_add(squeeze, s2.lastch);
 		}
 	}
 
+	cset_cache(squeeze);	/* presumably a lookup speedup; see cset.c */
+	cmap_cache(map);	/* presumably likewise; see cmap.c */
+
 	if (sflag)
-		for (lastch = OOBCH; (ch = getchar()) != EOF;) {
-			ch = string1[ch];
-			if (!string2[ch] || lastch != ch) {
+		for (lastch = OOBCH; (ch = getwchar()) != WEOF;) {
+			if (!Cflag || iswrune(ch)) /* -C: non-runes unchanged */
+				ch = cmap_lookup(map, ch);
+			if (lastch != ch || !cset_in(squeeze, ch)) {
 				lastch = ch;
-				(void)putchar(ch);
+				(void)putwchar(ch);
 			}
 		}
 	else
-		while ((ch = getchar()) != EOF)
-			(void)putchar(string1[ch]);
+		while ((ch = getwchar()) != WEOF) { /* NOTE(review): or error */
+			if (!Cflag || iswrune(ch)) /* -C: non-runes unchanged */
+				ch = cmap_lookup(map, ch);
+			(void)putwchar(ch);
+		}
 	exit (0);
 }
 
-static void
-setup(int *string, char *arg, STR *str, int cflag, int Cflag)
+static struct cset *
+setup(char *arg, STR *str, int cflag, int Cflag) /* arg -> new cset */
 {
-	int cnt, *p;
+	struct cset *cs;	/* set under construction; returned */
 
+	cs = cset_alloc();
+	if (cs == NULL)
+		err(1, NULL);	/* allocation failed: exit */
 	str->str = arg;
-	bzero(string, NCHARS * sizeof(int));
 	while (next(str))
-		string[str->lastch] = 1;
-	if (cflag)
-		for (p = string, cnt = NCHARS; cnt--; ++p)
-			*p = !*p;
-	else if (Cflag)
-		for (cnt = 0; cnt < NCHARS; cnt++)
-			string[cnt] = !string[cnt] && ISCHAR(cnt);
+		cset_add(cs, str->lastch);	/* each char parsed from arg */
+	if (Cflag)	/* presumably adds the inverted class - TODO confirm */
+		cset_addclass(cs, wctype("rune"), true);
+	if (cflag || Cflag)
+		cset_invert(cs);	/* complement for -c / -C */
+	cset_cache(cs);		/* presumably a lookup speedup; see cset.c */
+	return (cs);
 }
 
 int