diff --git a/usr.bin/wc/wc.1 b/usr.bin/wc/wc.1 index fc535e2693a0..92abd0f03ad4 100644 --- a/usr.bin/wc/wc.1 +++ b/usr.bin/wc/wc.1 @@ -40,10 +40,10 @@ .Os .Sh NAME .Nm wc -.Nd word, line, and byte count +.Nd word, line, character, and byte count .Sh SYNOPSIS .Nm -.Op Fl clw +.Op Fl clmw .Op Ar .Sh DESCRIPTION The @@ -71,6 +71,12 @@ is written to the standard output. .It Fl l The number of lines in each input file is written to the standard output. +.It Fl m +The number of characters in each input file is written to the standard output. +If the current locale does not support multibyte characters, this +is equivalent to the +.Fl c +option. .It Fl w The number of words in each input file is written to the standard output. @@ -79,10 +85,36 @@ is written to the standard output. When an option is specified, .Nm only reports the information requested by that option. -The default action is equivalent to specifying all of the flags. +The default action is equivalent to specifying the +.Fl c , +.Fl l +and +.Fl w +options. .Pp If no files are specified, the standard input is used and no file name is displayed. +.Sh ENVIRONMENT +The +.Ev LANG , +.Ev LC_ALL +and +.Ev LC_CTYPE +environment variables affect the execution of +.Nm +as described in +.Xr environ 7 +when the +.Fl m +option is specified. +.Sh EXAMPLES +Count the number of characters, words and lines in each of the files +.Pa report1 +and +.Pa report2 +as well as the totals for both: +.Pp +.Dl "wc -mlw report1 report2" .Sh DIAGNOSTICS .Ex -std .Sh SEE ALSO @@ -108,7 +140,7 @@ function, as required by The .Nm function conforms to -.St -p1003.2 . +.St -p1003.1-2001 . 
.Sh HISTORY A .Nm diff --git a/usr.bin/wc/wc.c b/usr.bin/wc/wc.c index d70d94093920..cacc08e03ef6 100644 --- a/usr.bin/wc/wc.c +++ b/usr.bin/wc/wc.c @@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -60,7 +61,7 @@ __FBSDID("$FreeBSD$"); #include uintmax_t tlinect, twordct, tcharct; -int doline, doword, dochar; +int doline, doword, dochar, domulti; static int cnt(const char *); static void usage(void); @@ -74,7 +75,7 @@ main(argc, argv) (void) setlocale(LC_CTYPE, ""); - while ((ch = getopt(argc, argv, "lwc")) != -1) + while ((ch = getopt(argc, argv, "clmw")) != -1) switch((char)ch) { case 'l': doline = 1; @@ -84,6 +85,11 @@ main(argc, argv) break; case 'c': dochar = 1; + domulti = 0; + break; + case 'm': + domulti = 1; + dochar = 0; break; case '?': default: @@ -93,7 +99,7 @@ main(argc, argv) argc -= optind; /* Wc's flags are on by default. */ - if (doline + doword + dochar == 0) + if (doline + doword + dochar + domulti == 0) doline = doword = dochar = 1; errors = 0; @@ -117,7 +123,7 @@ main(argc, argv) (void)printf(" %7ju", tlinect); if (doword) (void)printf(" %7ju", twordct); - if (dochar) + if (dochar || domulti) (void)printf(" %7ju", tcharct); (void)printf(" total\n"); } @@ -130,10 +136,12 @@ cnt(file) { struct stat sb; uintmax_t linect, wordct, charct; - int fd, len; + ssize_t nread; + int clen, fd, len, warned; short gotsp; u_char *p; u_char buf[MAXBSIZE], ch; + wchar_t wch; linect = wordct = charct = 0; if (file == NULL) { @@ -144,7 +152,7 @@ cnt(file) warn("%s: open", file); return (1); } - if (doword) + if (doword || (domulti && MB_CUR_MAX != 1)) goto word; /* * Line counting is split out because it's a lot faster to get @@ -176,7 +184,7 @@ cnt(file) * If all we need is the number of characters and it's a * regular or linked file, just stat the puppy. 
*/ - if (dochar) { + if (dochar || domulti) { if (fstat(fd, &sb)) { warn("%s: fstat", file); (void)close(fd); @@ -192,22 +200,41 @@ cnt(file) } /* Do it the hard way... */ -word: for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) { - if (len == -1) { +word: gotsp = 1; + len = 0; + warned = 0; + while ((nread = read(fd, buf + len, MAXBSIZE - len)) != 0) { + if (nread == -1) { warn("%s: read", file); (void)close(fd); return (1); } - /* - * This loses in the presence of multi-byte characters. - * To do it right would require a function to return a - * character while knowing how many bytes it consumed. - */ - charct += len; - for (p = buf; len--;) { - ch = *p++; - if (ch == '\n') + len += nread; + p = buf; + while (len > 0) { + if (!domulti || MB_CUR_MAX == 1) { + clen = 1; + wch = (unsigned char)*p; + } else if ((clen = mbtowc(&wch, p, len)) <= 0) { + if (len > MB_CUR_MAX) { + clen = 1; + wch = (unsigned char)*p; + if (!warned) { + errno = EILSEQ; + warn("%s", file); + warned = 1; + } + } else { + memmove(buf, p, len); + break; + } + } + charct++; + len -= clen; + p += clen; + if (wch == L'\n') ++linect; - if (isspace(ch)) + /* XXX Non-portable; should use iswspace() */ + if (isspace(wch)) gotsp = 1; else if (gotsp) { @@ -224,7 +251,7 @@ word: for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) { twordct += wordct; (void)printf(" %7ju", wordct); } - if (dochar) { + if (dochar || domulti) { tcharct += charct; (void)printf(" %7ju", charct); } @@ -235,6 +262,6 @@ word: for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) { static void usage() { - (void)fprintf(stderr, "usage: wc [-clw] [file ...]\n"); + (void)fprintf(stderr, "usage: wc [-clmw] [file ...]\n"); exit(1); }