Add the -m option, which counts characters (as opposed to -c, which
counts bytes). In locales that don't have multibyte characters, -m
is effectively an alias for -c.

This brings wc(1) up to P1003.1-2001 conformance.
This commit is contained in:
tjr 2002-06-13 12:48:50 +00:00
parent 63759e6467
commit 1e238aa8c3
2 changed files with 83 additions and 24 deletions

View File

@ -40,10 +40,10 @@
.Os
.Sh NAME
.Nm wc
.Nd word, line, and byte count
.Nd word, line, character, and byte count
.Sh SYNOPSIS
.Nm
.Op Fl clw
.Op Fl clmw
.Op Ar
.Sh DESCRIPTION
The
@ -71,6 +71,12 @@ is written to the standard output.
.It Fl l
The number of lines in each input file
is written to the standard output.
.It Fl m
The number of characters in each input file is written to the standard output.
If the current locale does not support multibyte characters, this
is equivalent to the
.Fl c
option.
.It Fl w
The number of words in each input file
is written to the standard output.
@ -79,10 +85,36 @@ is written to the standard output.
When an option is specified,
.Nm
only reports the information requested by that option.
The default action is equivalent to specifying all of the flags.
The default action is equivalent to specifying the
.Fl c ,
.Fl l
and
.Fl w
options.
.Pp
If no files are specified, the standard input is used and no
file name is displayed.
.Sh ENVIRONMENT
The
.Ev LANG ,
.Ev LC_ALL
and
.Ev LC_CTYPE
environment variables affect the execution of
.Nm
as described in
.Xr environ 7
when the
.Fl m
option is specified.
.Sh EXAMPLES
Count the number of characters, words and lines in each of the files
.Pa report1
and
.Pa report2
as well as the totals for both:
.Pp
.Dl "wc -mlw report1 report2"
.Sh DIAGNOSTICS
.Ex -std
.Sh SEE ALSO
@ -108,7 +140,7 @@ function, as required by
The
.Nm
function conforms to
.St -p1003.2 .
.St -p1003.1-2001 .
.Sh HISTORY
A
.Nm

View File

@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <locale.h>
#include <stdint.h>
@ -60,7 +61,7 @@ __FBSDID("$FreeBSD$");
#include <unistd.h>
uintmax_t tlinect, twordct, tcharct;
int doline, doword, dochar;
int doline, doword, dochar, domulti;
static int cnt(const char *);
static void usage(void);
@ -74,7 +75,7 @@ main(argc, argv)
(void) setlocale(LC_CTYPE, "");
while ((ch = getopt(argc, argv, "lwc")) != -1)
while ((ch = getopt(argc, argv, "clmw")) != -1)
switch((char)ch) {
case 'l':
doline = 1;
@ -84,6 +85,11 @@ main(argc, argv)
break;
case 'c':
dochar = 1;
domulti = 0;
break;
case 'm':
domulti = 1;
dochar = 0;
break;
case '?':
default:
@ -93,7 +99,7 @@ main(argc, argv)
argc -= optind;
/* Wc's flags are on by default. */
if (doline + doword + dochar == 0)
if (doline + doword + dochar + domulti == 0)
doline = doword = dochar = 1;
errors = 0;
@ -117,7 +123,7 @@ main(argc, argv)
(void)printf(" %7ju", tlinect);
if (doword)
(void)printf(" %7ju", twordct);
if (dochar)
if (dochar || domulti)
(void)printf(" %7ju", tcharct);
(void)printf(" total\n");
}
@ -130,10 +136,12 @@ cnt(file)
{
struct stat sb;
uintmax_t linect, wordct, charct;
int fd, len;
ssize_t nread;
int clen, fd, len, warned;
short gotsp;
u_char *p;
u_char buf[MAXBSIZE], ch;
wchar_t wch;
linect = wordct = charct = 0;
if (file == NULL) {
@ -144,7 +152,7 @@ cnt(file)
warn("%s: open", file);
return (1);
}
if (doword)
if (doword || (domulti && MB_CUR_MAX != 1))
goto word;
/*
* Line counting is split out because it's a lot faster to get
@ -176,7 +184,7 @@ cnt(file)
* If all we need is the number of characters and it's a
* regular or linked file, just stat the puppy.
*/
if (dochar) {
if (dochar || domulti) {
if (fstat(fd, &sb)) {
warn("%s: fstat", file);
(void)close(fd);
@ -192,22 +200,41 @@ cnt(file)
}
/* Do it the hard way... */
word: for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) {
if (len == -1) {
word: gotsp = 1;
len = 0;
warned = 0;
while ((nread = read(fd, buf + len, MAXBSIZE - len)) != 0) {
if (nread == -1) {
warn("%s: read", file);
(void)close(fd);
return (1);
}
/*
* This loses in the presence of multi-byte characters.
* To do it right would require a function to return a
* character while knowing how many bytes it consumed.
*/
charct += len;
for (p = buf; len--;) {
ch = *p++;
if (ch == '\n')
len += nread;
p = buf;
while (len > 0) {
if (!domulti || MB_CUR_MAX == 1) {
clen = 1;
wch = (unsigned char)*p;
} else if ((clen = mbtowc(&wch, p, len)) <= 0) {
if (len > MB_CUR_MAX) {
clen = 1;
wch = (unsigned char)*p;
if (!warned) {
errno = EILSEQ;
warn("%s", file);
warned = 1;
}
} else {
memmove(buf, p, len);
break;
}
}
charct++;
len -= clen;
p += clen;
if (wch == L'\n')
++linect;
/* XXX Non-portable; should use iswspace() */
if (isspace(ch))
gotsp = 1;
else if (gotsp) {
@ -224,7 +251,7 @@ word: for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) {
twordct += wordct;
(void)printf(" %7ju", wordct);
}
if (dochar) {
if (dochar || domulti) {
tcharct += charct;
(void)printf(" %7ju", charct);
}
@ -235,6 +262,6 @@ word: for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) {
static void
usage()
{
(void)fprintf(stderr, "usage: wc [-clw] [file ...]\n");
(void)fprintf(stderr, "usage: wc [-clmw] [file ...]\n");
exit(1);
}