From d3e5b0992d4508937c5801f98a39fa97c5407ac0 Mon Sep 17 00:00:00 2001 From: tjr Date: Sun, 27 Jun 2004 16:42:33 +0000 Subject: [PATCH] Handle multibyte characters when cutting out fields (-f and -d options.) --- usr.bin/cut/cut.1 | 10 +------- usr.bin/cut/cut.c | 65 +++++++++++++++++++++++++++++++++-------------- 2 files changed, 47 insertions(+), 28 deletions(-) diff --git a/usr.bin/cut/cut.1 b/usr.bin/cut/cut.1 index f978320657d7..7c99a006aa26 100644 --- a/usr.bin/cut/cut.1 +++ b/usr.bin/cut/cut.1 @@ -103,7 +103,7 @@ The .Ar list specifies character positions. .It Fl d Ar delim -Use the first character of +Use .Ar delim as the field delimiter character instead of the tab character. .It Fl f Ar list @@ -156,11 +156,3 @@ command appeared in .Tn AT&T System III .Ux . -.Sh BUGS -When operating on fields -.Fl ( f -option is specified), -.Nm -does not recognise multibyte characters, and the -.Ar delim -character is recognised in the middle of multibyte sequences. diff --git a/usr.bin/cut/cut.c b/usr.bin/cut/cut.c index 9d7cb50675b7..7b0cd41ba846 100644 --- a/usr.bin/cut/cut.c +++ b/usr.bin/cut/cut.c @@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -55,7 +56,8 @@ __FBSDID("$FreeBSD$"); int bflag; int cflag; -char dchar; +wchar_t dchar; +char dcharmb[MB_LEN_MAX + 1]; int dflag; int fflag; int nflag; @@ -75,11 +77,13 @@ main(int argc, char *argv[]) FILE *fp; int (*fcn)(FILE *, const char *); int ch, rval; + size_t n; setlocale(LC_ALL, ""); fcn = NULL; dchar = '\t'; /* default delimiter is \t */ + strcpy(dcharmb, "\t"); while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1) switch(ch) { @@ -92,7 +96,10 @@ main(int argc, char *argv[]) cflag = 1; break; case 'd': - dchar = *optarg; + n = mbrtowc(&dchar, optarg, MB_LEN_MAX, NULL); + if (dchar == '\0' || n != strlen(optarg)) + errx(1, "bad delimiter"); + strcpy(dcharmb, optarg); dflag = 1; break; case 'f': @@ -357,13 +364,15 @@ c_cut(FILE *fp, const char *fname) } int -f_cut(FILE *fp, const char *fname __unused) +f_cut(FILE *fp, const char *fname) { - int ch, field, isdelim; - char *pos, *p, sep; + wchar_t ch; + int field, i, isdelim; + char *pos, *p; + wchar_t sep; int output; char *lbuf, *mlbuf; - size_t lbuflen; + size_t clen, lbuflen; mlbuf = NULL; for (sep = dchar; (lbuf = fgetln(fp, &lbuflen)) != NULL;) { @@ -378,8 +387,15 @@ f_cut(FILE *fp, const char *fname __unused) lbuf = mlbuf; } output = 0; - for (isdelim = 0, p = lbuf;; ++p) { - ch = *p; + for (isdelim = 0, p = lbuf;; p += clen) { + clen = mbrtowc(&ch, p, lbuf + lbuflen - p, NULL); + if (clen == (size_t)-1 || clen == (size_t)-2) { + warnc(EILSEQ, "%s", fname); + free(mlbuf); + return (1); + } + if (clen == 0) + clen = 1; /* this should work if newline is delimiter */ if (ch == sep) isdelim = 1; @@ -394,14 +410,25 @@ f_cut(FILE *fp, const char *fname __unused) pos = positions + 1; for (field = maxval, p = lbuf; field; --field, ++pos) { - if (*pos) { - if (output++) - (void)putchar(sep); - while ((ch = *p++) != '\n' && ch != sep) - (void)putchar(ch); - } else { - while ((ch = *p++) != '\n' && ch != sep) - continue; + if (*pos && output++) + for (i = 0; dcharmb[i] != '\0'; i++) + putchar(dcharmb[i]); + for (;;) { + clen = mbrtowc(&ch, p, lbuf + lbuflen - p, + NULL); + if (clen == (size_t)-1 || clen == (size_t)-2) { + warnc(EILSEQ, "%s", fname); + free(mlbuf); + return (1); + } + if (clen == 0) + clen = 1; + p += clen; + if (ch == '\n' || ch == sep) + break; + if (*pos) + for (i = 0; i < (int)clen; i++) + putchar(p[i - clen]); } if (ch == '\n') break; @@ -409,7 +436,8 @@ f_cut(FILE *fp, const char *fname __unused) if (ch != '\n') { if (autostop) { if (output) - (void)putchar(sep); + for (i = 0; dcharmb[i] != '\0'; i++) + putchar(dcharmb[i]); for (; (ch = *p) != '\n'; ++p) (void)putchar(ch); } else @@ -417,8 +445,7 @@ f_cut(FILE *fp, const char *fname __unused) } (void)putchar('\n'); } - if (mlbuf != NULL) - free(mlbuf); + free(mlbuf); return (0); }