Handle multibyte characters when cutting out fields (-f and -d options).

This commit is contained in:
tjr 2004-06-27 16:42:33 +00:00
parent 02b0649fa8
commit d3e5b0992d
2 changed files with 47 additions and 28 deletions

View File

@ -103,7 +103,7 @@ The
.Ar list
specifies character positions.
.It Fl d Ar delim
Use the first character of
Use
.Ar delim
as the field delimiter character instead of the tab character.
.It Fl f Ar list
@ -156,11 +156,3 @@ command appeared in
.Tn AT&T
System III
.Ux .
.Sh BUGS
When operating on fields
.Fl ( f
option is specified),
.Nm
does not recognise multibyte characters, and the
.Ar delim
character is recognised in the middle of multibyte sequences.

View File

@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <locale.h>
#include <stdio.h>
@ -55,7 +56,8 @@ __FBSDID("$FreeBSD$");
int bflag;
int cflag;
char dchar;
wchar_t dchar;
char dcharmb[MB_LEN_MAX + 1];
int dflag;
int fflag;
int nflag;
@ -75,11 +77,13 @@ main(int argc, char *argv[])
FILE *fp;
int (*fcn)(FILE *, const char *);
int ch, rval;
size_t n;
setlocale(LC_ALL, "");
fcn = NULL;
dchar = '\t'; /* default delimiter is \t */
strcpy(dcharmb, "\t");
while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1)
switch(ch) {
@ -92,7 +96,10 @@ main(int argc, char *argv[])
cflag = 1;
break;
case 'd':
dchar = *optarg;
n = mbrtowc(&dchar, optarg, MB_LEN_MAX, NULL);
if (dchar == '\0' || n != strlen(optarg))
errx(1, "bad delimiter");
strcpy(dcharmb, optarg);
dflag = 1;
break;
case 'f':
@ -357,13 +364,15 @@ c_cut(FILE *fp, const char *fname)
}
int
f_cut(FILE *fp, const char *fname __unused)
f_cut(FILE *fp, const char *fname)
{
int ch, field, isdelim;
char *pos, *p, sep;
wchar_t ch;
int field, i, isdelim;
char *pos, *p;
wchar_t sep;
int output;
char *lbuf, *mlbuf;
size_t lbuflen;
size_t clen, lbuflen;
mlbuf = NULL;
for (sep = dchar; (lbuf = fgetln(fp, &lbuflen)) != NULL;) {
@ -378,8 +387,15 @@ f_cut(FILE *fp, const char *fname __unused)
lbuf = mlbuf;
}
output = 0;
for (isdelim = 0, p = lbuf;; ++p) {
ch = *p;
for (isdelim = 0, p = lbuf;; p += clen) {
clen = mbrtowc(&ch, p, lbuf + lbuflen - p, NULL);
if (clen == (size_t)-1 || clen == (size_t)-2) {
warnc(EILSEQ, "%s", fname);
free(mlbuf);
return (1);
}
if (clen == 0)
clen = 1;
/* this should work if newline is delimiter */
if (ch == sep)
isdelim = 1;
@ -394,14 +410,25 @@ f_cut(FILE *fp, const char *fname __unused)
pos = positions + 1;
for (field = maxval, p = lbuf; field; --field, ++pos) {
if (*pos) {
if (output++)
(void)putchar(sep);
while ((ch = *p++) != '\n' && ch != sep)
(void)putchar(ch);
} else {
while ((ch = *p++) != '\n' && ch != sep)
continue;
if (*pos && output++)
for (i = 0; dcharmb[i] != '\0'; i++)
putchar(dcharmb[i]);
for (;;) {
clen = mbrtowc(&ch, p, lbuf + lbuflen - p,
NULL);
if (clen == (size_t)-1 || clen == (size_t)-2) {
warnc(EILSEQ, "%s", fname);
free(mlbuf);
return (1);
}
if (clen == 0)
clen = 1;
p += clen;
if (ch == '\n' || ch == sep)
break;
if (*pos)
for (i = 0; i < (int)clen; i++)
putchar(p[i - clen]);
}
if (ch == '\n')
break;
@ -409,7 +436,8 @@ f_cut(FILE *fp, const char *fname __unused)
if (ch != '\n') {
if (autostop) {
if (output)
(void)putchar(sep);
for (i = 0; dcharmb[i] != '\0'; i++)
putchar(dcharmb[i]);
for (; (ch = *p) != '\n'; ++p)
(void)putchar(ch);
} else
@ -417,8 +445,7 @@ f_cut(FILE *fp, const char *fname __unused)
}
(void)putchar('\n');
}
if (mlbuf != NULL)
free(mlbuf);
free(mlbuf);
return (0);
}