Handle multibyte characters when cutting out fields (-f and -d options).
This commit is contained in:
parent
0dcb7b75a3
commit
a5c4bafcd1
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/head/; revision=131197
@ -103,7 +103,7 @@ The
|
|||||||
.Ar list
|
.Ar list
|
||||||
specifies character positions.
|
specifies character positions.
|
||||||
.It Fl d Ar delim
|
.It Fl d Ar delim
|
||||||
Use the first character of
|
Use
|
||||||
.Ar delim
|
.Ar delim
|
||||||
as the field delimiter character instead of the tab character.
|
as the field delimiter character instead of the tab character.
|
||||||
.It Fl f Ar list
|
.It Fl f Ar list
|
||||||
@ -156,11 +156,3 @@ command appeared in
|
|||||||
.Tn AT&T
|
.Tn AT&T
|
||||||
System III
|
System III
|
||||||
.Ux .
|
.Ux .
|
||||||
.Sh BUGS
|
|
||||||
When operating on fields
|
|
||||||
.Fl ( f
|
|
||||||
option is specified),
|
|
||||||
.Nm
|
|
||||||
does not recognise multibyte characters, and the
|
|
||||||
.Ar delim
|
|
||||||
character is recognised in the middle of multibyte sequences.
|
|
||||||
|
@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
|
|||||||
|
|
||||||
#include <ctype.h>
|
#include <ctype.h>
|
||||||
#include <err.h>
|
#include <err.h>
|
||||||
|
#include <errno.h>
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
#include <locale.h>
|
#include <locale.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
@ -55,7 +56,8 @@ __FBSDID("$FreeBSD$");
|
|||||||
|
|
||||||
int bflag;
|
int bflag;
|
||||||
int cflag;
|
int cflag;
|
||||||
char dchar;
|
wchar_t dchar;
|
||||||
|
char dcharmb[MB_LEN_MAX + 1];
|
||||||
int dflag;
|
int dflag;
|
||||||
int fflag;
|
int fflag;
|
||||||
int nflag;
|
int nflag;
|
||||||
@ -75,11 +77,13 @@ main(int argc, char *argv[])
|
|||||||
FILE *fp;
|
FILE *fp;
|
||||||
int (*fcn)(FILE *, const char *);
|
int (*fcn)(FILE *, const char *);
|
||||||
int ch, rval;
|
int ch, rval;
|
||||||
|
size_t n;
|
||||||
|
|
||||||
setlocale(LC_ALL, "");
|
setlocale(LC_ALL, "");
|
||||||
|
|
||||||
fcn = NULL;
|
fcn = NULL;
|
||||||
dchar = '\t'; /* default delimiter is \t */
|
dchar = '\t'; /* default delimiter is \t */
|
||||||
|
strcpy(dcharmb, "\t");
|
||||||
|
|
||||||
while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1)
|
while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1)
|
||||||
switch(ch) {
|
switch(ch) {
|
||||||
@ -92,7 +96,10 @@ main(int argc, char *argv[])
|
|||||||
cflag = 1;
|
cflag = 1;
|
||||||
break;
|
break;
|
||||||
case 'd':
|
case 'd':
|
||||||
dchar = *optarg;
|
n = mbrtowc(&dchar, optarg, MB_LEN_MAX, NULL);
|
||||||
|
if (dchar == '\0' || n != strlen(optarg))
|
||||||
|
errx(1, "bad delimiter");
|
||||||
|
strcpy(dcharmb, optarg);
|
||||||
dflag = 1;
|
dflag = 1;
|
||||||
break;
|
break;
|
||||||
case 'f':
|
case 'f':
|
||||||
@ -357,13 +364,15 @@ c_cut(FILE *fp, const char *fname)
|
|||||||
}
|
}
|
||||||
|
|
||||||
int
|
int
|
||||||
f_cut(FILE *fp, const char *fname __unused)
|
f_cut(FILE *fp, const char *fname)
|
||||||
{
|
{
|
||||||
int ch, field, isdelim;
|
wchar_t ch;
|
||||||
char *pos, *p, sep;
|
int field, i, isdelim;
|
||||||
|
char *pos, *p;
|
||||||
|
wchar_t sep;
|
||||||
int output;
|
int output;
|
||||||
char *lbuf, *mlbuf;
|
char *lbuf, *mlbuf;
|
||||||
size_t lbuflen;
|
size_t clen, lbuflen;
|
||||||
|
|
||||||
mlbuf = NULL;
|
mlbuf = NULL;
|
||||||
for (sep = dchar; (lbuf = fgetln(fp, &lbuflen)) != NULL;) {
|
for (sep = dchar; (lbuf = fgetln(fp, &lbuflen)) != NULL;) {
|
||||||
@ -378,8 +387,15 @@ f_cut(FILE *fp, const char *fname __unused)
|
|||||||
lbuf = mlbuf;
|
lbuf = mlbuf;
|
||||||
}
|
}
|
||||||
output = 0;
|
output = 0;
|
||||||
for (isdelim = 0, p = lbuf;; ++p) {
|
for (isdelim = 0, p = lbuf;; p += clen) {
|
||||||
ch = *p;
|
clen = mbrtowc(&ch, p, lbuf + lbuflen - p, NULL);
|
||||||
|
if (clen == (size_t)-1 || clen == (size_t)-2) {
|
||||||
|
warnc(EILSEQ, "%s", fname);
|
||||||
|
free(mlbuf);
|
||||||
|
return (1);
|
||||||
|
}
|
||||||
|
if (clen == 0)
|
||||||
|
clen = 1;
|
||||||
/* this should work if newline is delimiter */
|
/* this should work if newline is delimiter */
|
||||||
if (ch == sep)
|
if (ch == sep)
|
||||||
isdelim = 1;
|
isdelim = 1;
|
||||||
@ -394,14 +410,25 @@ f_cut(FILE *fp, const char *fname __unused)
|
|||||||
|
|
||||||
pos = positions + 1;
|
pos = positions + 1;
|
||||||
for (field = maxval, p = lbuf; field; --field, ++pos) {
|
for (field = maxval, p = lbuf; field; --field, ++pos) {
|
||||||
if (*pos) {
|
if (*pos && output++)
|
||||||
if (output++)
|
for (i = 0; dcharmb[i] != '\0'; i++)
|
||||||
(void)putchar(sep);
|
putchar(dcharmb[i]);
|
||||||
while ((ch = *p++) != '\n' && ch != sep)
|
for (;;) {
|
||||||
(void)putchar(ch);
|
clen = mbrtowc(&ch, p, lbuf + lbuflen - p,
|
||||||
} else {
|
NULL);
|
||||||
while ((ch = *p++) != '\n' && ch != sep)
|
if (clen == (size_t)-1 || clen == (size_t)-2) {
|
||||||
continue;
|
warnc(EILSEQ, "%s", fname);
|
||||||
|
free(mlbuf);
|
||||||
|
return (1);
|
||||||
|
}
|
||||||
|
if (clen == 0)
|
||||||
|
clen = 1;
|
||||||
|
p += clen;
|
||||||
|
if (ch == '\n' || ch == sep)
|
||||||
|
break;
|
||||||
|
if (*pos)
|
||||||
|
for (i = 0; i < (int)clen; i++)
|
||||||
|
putchar(p[i - clen]);
|
||||||
}
|
}
|
||||||
if (ch == '\n')
|
if (ch == '\n')
|
||||||
break;
|
break;
|
||||||
@ -409,7 +436,8 @@ f_cut(FILE *fp, const char *fname __unused)
|
|||||||
if (ch != '\n') {
|
if (ch != '\n') {
|
||||||
if (autostop) {
|
if (autostop) {
|
||||||
if (output)
|
if (output)
|
||||||
(void)putchar(sep);
|
for (i = 0; dcharmb[i] != '\0'; i++)
|
||||||
|
putchar(dcharmb[i]);
|
||||||
for (; (ch = *p) != '\n'; ++p)
|
for (; (ch = *p) != '\n'; ++p)
|
||||||
(void)putchar(ch);
|
(void)putchar(ch);
|
||||||
} else
|
} else
|
||||||
@ -417,8 +445,7 @@ f_cut(FILE *fp, const char *fname __unused)
|
|||||||
}
|
}
|
||||||
(void)putchar('\n');
|
(void)putchar('\n');
|
||||||
}
|
}
|
||||||
if (mlbuf != NULL)
|
free(mlbuf);
|
||||||
free(mlbuf);
|
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user