Make the 'y' (translate) command aware of multibyte characters.
This commit is contained in:
parent
084c37915e
commit
b7f5e217dd
@ -47,12 +47,14 @@ static const char sccsid[] = "@(#)compile.c 8.1 (Berkeley) 6/6/93";
|
||||
|
||||
#include <ctype.h>
|
||||
#include <err.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <limits.h>
|
||||
#include <regex.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <wchar.h>
|
||||
|
||||
#include "defs.h"
|
||||
#include "extern.h"
|
||||
@ -73,7 +75,7 @@ static char *compile_flags(char *, struct s_subst *);
|
||||
static char *compile_re(char *, regex_t **);
|
||||
static char *compile_subst(char *, struct s_subst *);
|
||||
static char *compile_text(void);
|
||||
static char *compile_tr(char *, char **);
|
||||
static char *compile_tr(char *, struct s_tr **);
|
||||
static struct s_command
|
||||
**compile_stream(struct s_command **);
|
||||
static char *duptoeol(char *, const char *);
|
||||
@ -337,7 +339,7 @@ nonsel: /* Now parse the command */
|
||||
break;
|
||||
case TR: /* y */
|
||||
p++;
|
||||
p = compile_tr(p, (char **)&cmd->u.y);
|
||||
p = compile_tr(p, &cmd->u.y);
|
||||
EATSPACE();
|
||||
if (*p == ';') {
|
||||
p++;
|
||||
@ -619,12 +621,20 @@ compile_flags(char *p, struct s_subst *s)
|
||||
* Compile a translation set of strings into a lookup table.
|
||||
*/
|
||||
static char *
|
||||
compile_tr(char *p, char **transtab)
|
||||
compile_tr(char *p, struct s_tr **py)
|
||||
{
|
||||
struct s_tr *y;
|
||||
int i;
|
||||
char *lt, *op, *np;
|
||||
const char *op, *np;
|
||||
char old[_POSIX2_LINE_MAX + 1];
|
||||
char new[_POSIX2_LINE_MAX + 1];
|
||||
size_t oclen, oldlen, nclen, newlen;
|
||||
mbstate_t mbs1, mbs2;
|
||||
|
||||
if ((*py = y = malloc(sizeof(*y))) == NULL)
|
||||
err(1, NULL);
|
||||
y->multis = NULL;
|
||||
y->nmultis = 0;
|
||||
|
||||
if (*p == '\0' || *p == '\\')
|
||||
errx(1,
|
||||
@ -639,17 +649,63 @@ compile_tr(char *p, char **transtab)
|
||||
errx(1, "%lu: %s: unterminated transform target string",
|
||||
linenum, fname);
|
||||
EATSPACE();
|
||||
if (strlen(new) != strlen(old))
|
||||
op = old;
|
||||
oldlen = mbsrtowcs(NULL, &op, 0, NULL);
|
||||
if (oldlen == (size_t)-1)
|
||||
err(1, NULL);
|
||||
np = new;
|
||||
newlen = mbsrtowcs(NULL, &np, 0, NULL);
|
||||
if (newlen == (size_t)-1)
|
||||
err(1, NULL);
|
||||
if (newlen != oldlen)
|
||||
errx(1, "%lu: %s: transform strings are not the same length",
|
||||
linenum, fname);
|
||||
/* We assume characters are 8 bits */
|
||||
if ((lt = malloc(UCHAR_MAX)) == NULL)
|
||||
err(1, "malloc");
|
||||
for (i = 0; i <= UCHAR_MAX; i++)
|
||||
lt[i] = (char)i;
|
||||
for (op = old, np = new; *op; op++, np++)
|
||||
lt[(u_char)*op] = *np;
|
||||
*transtab = lt;
|
||||
if (MB_CUR_MAX == 1) {
|
||||
/*
|
||||
* The single-byte encoding case is easy: generate a
|
||||
* lookup table.
|
||||
*/
|
||||
for (i = 0; i <= UCHAR_MAX; i++)
|
||||
y->bytetab[i] = (char)i;
|
||||
for (; *op; op++, np++)
|
||||
y->bytetab[(u_char)*op] = *np;
|
||||
} else {
|
||||
/*
|
||||
* Multi-byte encoding case: generate a lookup table as
|
||||
* above, but only for single-byte characters. The first
|
||||
* bytes of multi-byte characters have their lookup table
|
||||
* entries set to 0, which causes do_tr() to search through
|
||||
* an auxiliary vector of multi-byte mappings.
|
||||
*/
|
||||
memset(&mbs1, 0, sizeof(mbs1));
|
||||
memset(&mbs2, 0, sizeof(mbs2));
|
||||
for (i = 0; i <= UCHAR_MAX; i++)
|
||||
y->bytetab[i] = (btowc(i) != WEOF) ? i : 0;
|
||||
while (*op != '\0') {
|
||||
oclen = mbrlen(op, MB_LEN_MAX, &mbs1);
|
||||
if (oclen == (size_t)-1 || oclen == (size_t)-2)
|
||||
errc(1, EILSEQ, NULL);
|
||||
nclen = mbrlen(np, MB_LEN_MAX, &mbs2);
|
||||
if (nclen == (size_t)-1 || nclen == (size_t)-2)
|
||||
errc(1, EILSEQ, NULL);
|
||||
if (oclen == 1 && nclen == 1)
|
||||
y->bytetab[(u_char)*op] = *np;
|
||||
else {
|
||||
y->bytetab[(u_char)*op] = 0;
|
||||
y->multis = realloc(y->multis,
|
||||
(y->nmultis + 1) * sizeof(*y->multis));
|
||||
if (y->multis == NULL)
|
||||
err(1, NULL);
|
||||
i = y->nmultis++;
|
||||
y->multis[i].fromlen = oclen;
|
||||
memcpy(y->multis[i].from, op, oclen);
|
||||
y->multis[i].tolen = nclen;
|
||||
memcpy(y->multis[i].to, np, nclen);
|
||||
}
|
||||
op += oclen;
|
||||
np += nclen;
|
||||
}
|
||||
}
|
||||
return (p);
|
||||
}
|
||||
|
||||
|
@ -35,6 +35,7 @@
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* @(#)defs.h 8.1 (Berkeley) 6/6/93
|
||||
* $FreeBSD$
|
||||
*/
|
||||
|
||||
/*
|
||||
@ -71,6 +72,19 @@ struct s_subst {
|
||||
char *new; /* Replacement text */
|
||||
};
|
||||
|
||||
/*
 * Translate command.
 */

/*
 * One mapping of the 'y' command in which the source or the replacement
 * character occupies more than one byte.
 */
struct trmulti {
	int	fromlen;		/* Number of valid bytes in from[] */
	char	from[MB_LEN_MAX];	/* Bytes of the character to replace */
	int	tolen;			/* Number of valid bytes in to[] */
	char	to[MB_LEN_MAX];		/* Bytes of the replacement character */
};

/*
 * Compiled form of a 'y' command.
 */
struct s_tr {
	/*
	 * Result byte for each input byte.  In a multibyte locale an entry
	 * of 0 tells do_tr() to look the character up in multis instead.
	 */
	unsigned char	bytetab[256];
	struct trmulti	*multis;	/* Vector of multibyte mappings */
	int		nmultis;	/* Number of entries in multis */
};
|
||||
|
||||
/*
|
||||
* An internally compiled command.
|
||||
@ -84,7 +98,7 @@ struct s_command {
|
||||
union {
|
||||
struct s_command *c; /* Command(s) for b t { */
|
||||
struct s_subst *s; /* Substitute command */
|
||||
u_char *y; /* Replace command array */
|
||||
struct s_tr *y; /* Replace command array */
|
||||
int fd; /* File descriptor for w */
|
||||
} u;
|
||||
char code; /* Command code */
|
||||
|
@ -56,6 +56,7 @@ static const char sccsid[] = "@(#)main.c 8.2 (Berkeley) 1/3/94";
|
||||
#include <err.h>
|
||||
#include <errno.h>
|
||||
#include <fcntl.h>
|
||||
#include <limits.h>
|
||||
#include <locale.h>
|
||||
#include <regex.h>
|
||||
#include <stddef.h>
|
||||
|
@ -45,6 +45,7 @@ static const char sccsid[] = "@(#)misc.c 8.1 (Berkeley) 6/6/93";
|
||||
#include <sys/types.h>
|
||||
|
||||
#include <err.h>
|
||||
#include <limits.h>
|
||||
#include <regex.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
@ -63,7 +63,7 @@ static const char sccsid[] = "@(#)process.c 8.6 (Berkeley) 4/20/94";
|
||||
#include "defs.h"
|
||||
#include "extern.h"
|
||||
|
||||
static SPACE HS, PS, SS;
|
||||
static SPACE HS, PS, SS, YS;
|
||||
#define pd PS.deleted
|
||||
#define ps PS.space
|
||||
#define psl PS.len
|
||||
@ -71,6 +71,7 @@ static SPACE HS, PS, SS;
|
||||
#define hsl HS.len
|
||||
|
||||
static __inline int applies(struct s_command *);
|
||||
static void do_tr(struct s_tr *);
|
||||
static void flush_appends(void);
|
||||
static void lputs(char *, size_t);
|
||||
static __inline int regexec_e(regex_t *, const char *, int, int, size_t);
|
||||
@ -97,6 +98,7 @@ process(void)
|
||||
SPACE tspace;
|
||||
size_t len, oldpsl = 0;
|
||||
char *p;
|
||||
char nc;
|
||||
|
||||
p = NULL;
|
||||
|
||||
@ -247,8 +249,7 @@ redirect:
|
||||
case 'y':
|
||||
if (pd || psl == 0)
|
||||
break;
|
||||
for (p = ps, len = psl; len--; ++p)
|
||||
*p = cp->u.y[(unsigned char)*p];
|
||||
do_tr(cp->u.y);
|
||||
break;
|
||||
case ':':
|
||||
case '}':
|
||||
@ -425,6 +426,61 @@ substitute(struct s_command *cp)
|
||||
return (1);
|
||||
}
|
||||
|
||||
/*
|
||||
* do_tr --
|
||||
* Perform translation ('y' command) in the pattern space.
|
||||
*/
|
||||
static void
|
||||
do_tr(struct s_tr *y)
|
||||
{
|
||||
SPACE tmp;
|
||||
char c, *p;
|
||||
size_t clen, left;
|
||||
int i;
|
||||
|
||||
if (MB_CUR_MAX == 1) {
|
||||
/*
|
||||
* Single-byte encoding: perform in-place translation
|
||||
* of the pattern space.
|
||||
*/
|
||||
for (p = ps; p < &ps[psl]; p++)
|
||||
*p = y->bytetab[(u_char)*p];
|
||||
} else {
|
||||
/*
|
||||
* Multi-byte encoding: perform translation into the
|
||||
* translation space, then swap the translation and
|
||||
* pattern spaces.
|
||||
*/
|
||||
/* Clean translation space. */
|
||||
YS.len = 0;
|
||||
for (p = ps, left = psl; left > 0; p += clen, left -= clen) {
|
||||
if ((c = y->bytetab[(u_char)*p]) != '\0') {
|
||||
cspace(&YS, &c, 1, APPEND);
|
||||
clen = 1;
|
||||
continue;
|
||||
}
|
||||
for (i = 0; i < y->nmultis; i++)
|
||||
if (left >= y->multis[i].fromlen &&
|
||||
memcmp(p, y->multis[i].from,
|
||||
y->multis[i].fromlen) == 0)
|
||||
break;
|
||||
if (i < y->nmultis) {
|
||||
cspace(&YS, y->multis[i].to,
|
||||
y->multis[i].tolen, APPEND);
|
||||
clen = y->multis[i].fromlen;
|
||||
} else {
|
||||
cspace(&YS, p, 1, APPEND);
|
||||
clen = 1;
|
||||
}
|
||||
}
|
||||
/* Swap the translation space and the pattern space. */
|
||||
tmp = PS;
|
||||
PS = YS;
|
||||
YS = tmp;
|
||||
YS.space = YS.back;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Flush append requests. Always called before reading a line,
|
||||
* therefore it also resets the substitution done (sdone) flag.
|
||||
|
Loading…
x
Reference in New Issue
Block a user