Handle multibyte characters when cutting out fields (-f and -d options).

This commit is contained in:
Tim J. Robbins 2004-06-27 16:42:33 +00:00
parent 0dcb7b75a3
commit a5c4bafcd1
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=131197
2 changed files with 47 additions and 28 deletions

View File

@ -103,7 +103,7 @@ The
.Ar list .Ar list
specifies character positions. specifies character positions.
.It Fl d Ar delim .It Fl d Ar delim
Use the first character of Use
.Ar delim .Ar delim
as the field delimiter character instead of the tab character. as the field delimiter character instead of the tab character.
.It Fl f Ar list .It Fl f Ar list
@ -156,11 +156,3 @@ command appeared in
.Tn AT&T .Tn AT&T
System III System III
.Ux . .Ux .
.Sh BUGS
When operating on fields
.Fl ( f
option is specified),
.Nm
does not recognise multibyte characters, and the
.Ar delim
character is recognised in the middle of multibyte sequences.

View File

@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <ctype.h> #include <ctype.h>
#include <err.h> #include <err.h>
#include <errno.h>
#include <limits.h> #include <limits.h>
#include <locale.h> #include <locale.h>
#include <stdio.h> #include <stdio.h>
@ -55,7 +56,8 @@ __FBSDID("$FreeBSD$");
int bflag; int bflag;
int cflag; int cflag;
char dchar; wchar_t dchar;
char dcharmb[MB_LEN_MAX + 1];
int dflag; int dflag;
int fflag; int fflag;
int nflag; int nflag;
@ -75,11 +77,13 @@ main(int argc, char *argv[])
FILE *fp; FILE *fp;
int (*fcn)(FILE *, const char *); int (*fcn)(FILE *, const char *);
int ch, rval; int ch, rval;
size_t n;
setlocale(LC_ALL, ""); setlocale(LC_ALL, "");
fcn = NULL; fcn = NULL;
dchar = '\t'; /* default delimiter is \t */ dchar = '\t'; /* default delimiter is \t */
strcpy(dcharmb, "\t");
while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1) while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1)
switch(ch) { switch(ch) {
@ -92,7 +96,10 @@ main(int argc, char *argv[])
cflag = 1; cflag = 1;
break; break;
case 'd': case 'd':
dchar = *optarg; n = mbrtowc(&dchar, optarg, MB_LEN_MAX, NULL);
if (dchar == '\0' || n != strlen(optarg))
errx(1, "bad delimiter");
strcpy(dcharmb, optarg);
dflag = 1; dflag = 1;
break; break;
case 'f': case 'f':
@ -357,13 +364,15 @@ c_cut(FILE *fp, const char *fname)
} }
int int
f_cut(FILE *fp, const char *fname __unused) f_cut(FILE *fp, const char *fname)
{ {
int ch, field, isdelim; wchar_t ch;
char *pos, *p, sep; int field, i, isdelim;
char *pos, *p;
wchar_t sep;
int output; int output;
char *lbuf, *mlbuf; char *lbuf, *mlbuf;
size_t lbuflen; size_t clen, lbuflen;
mlbuf = NULL; mlbuf = NULL;
for (sep = dchar; (lbuf = fgetln(fp, &lbuflen)) != NULL;) { for (sep = dchar; (lbuf = fgetln(fp, &lbuflen)) != NULL;) {
@ -378,8 +387,15 @@ f_cut(FILE *fp, const char *fname __unused)
lbuf = mlbuf; lbuf = mlbuf;
} }
output = 0; output = 0;
for (isdelim = 0, p = lbuf;; ++p) { for (isdelim = 0, p = lbuf;; p += clen) {
ch = *p; clen = mbrtowc(&ch, p, lbuf + lbuflen - p, NULL);
if (clen == (size_t)-1 || clen == (size_t)-2) {
warnc(EILSEQ, "%s", fname);
free(mlbuf);
return (1);
}
if (clen == 0)
clen = 1;
/* this should work if newline is delimiter */ /* this should work if newline is delimiter */
if (ch == sep) if (ch == sep)
isdelim = 1; isdelim = 1;
@ -394,14 +410,25 @@ f_cut(FILE *fp, const char *fname __unused)
pos = positions + 1; pos = positions + 1;
for (field = maxval, p = lbuf; field; --field, ++pos) { for (field = maxval, p = lbuf; field; --field, ++pos) {
if (*pos) { if (*pos && output++)
if (output++) for (i = 0; dcharmb[i] != '\0'; i++)
(void)putchar(sep); putchar(dcharmb[i]);
while ((ch = *p++) != '\n' && ch != sep) for (;;) {
(void)putchar(ch); clen = mbrtowc(&ch, p, lbuf + lbuflen - p,
} else { NULL);
while ((ch = *p++) != '\n' && ch != sep) if (clen == (size_t)-1 || clen == (size_t)-2) {
continue; warnc(EILSEQ, "%s", fname);
free(mlbuf);
return (1);
}
if (clen == 0)
clen = 1;
p += clen;
if (ch == '\n' || ch == sep)
break;
if (*pos)
for (i = 0; i < (int)clen; i++)
putchar(p[i - clen]);
} }
if (ch == '\n') if (ch == '\n')
break; break;
@ -409,7 +436,8 @@ f_cut(FILE *fp, const char *fname __unused)
if (ch != '\n') { if (ch != '\n') {
if (autostop) { if (autostop) {
if (output) if (output)
(void)putchar(sep); for (i = 0; dcharmb[i] != '\0'; i++)
putchar(dcharmb[i]);
for (; (ch = *p) != '\n'; ++p) for (; (ch = *p) != '\n'; ++p)
(void)putchar(ch); (void)putchar(ch);
} else } else
@ -417,8 +445,7 @@ f_cut(FILE *fp, const char *fname __unused)
} }
(void)putchar('\n'); (void)putchar('\n');
} }
if (mlbuf != NULL) free(mlbuf);
free(mlbuf);
return (0); return (0);
} }