Add the -m option, which counts characters (as opposed to -c, which counts bytes).

In locales that don't have multibyte characters, -m is effectively an alias for -c. This brings wc(1) up to P1003.1-2001 conformance.
2002-06-13 12:48:50 +00:00 · 2002-06-13 12:48:50 +00:00 · ebb42aee31
commit ebb42aee31
parent a446b510a4
2 changed files with 83 additions and 24 deletions
--- a/usr.bin/wc/wc.1
+++ b/usr.bin/wc/wc.1
@@ -40,10 +40,10 @@
 .Os
 .Sh NAME
 .Nm wc
-.Nd word, line, and byte count
+.Nd word, line, character, and byte count
 .Sh SYNOPSIS
 .Nm
-.Op Fl clw
+.Op Fl clmw
 .Op Ar
 .Sh DESCRIPTION
 The
@@ -71,6 +71,12 @@ is written to the standard output.
 .It Fl l
 The number of lines in each input file
 is written to the standard output.
+.It Fl m
+The number of characters in each input file is written to the standard output.
+If the current locale does not support multibyte characters, this
+is equivalent to the
+.Fl c
+option.
 .It Fl w
 The number of words in each input file
 is written to the standard output.
@@ -79,10 +85,36 @@ is written to the standard output.
 When an option is specified,
 .Nm
 only reports the information requested by that option.
-The default action is equivalent to specifying all of the flags.
+The default action is equivalent to specifying the
+.Fl c ,
+.Fl l
+and
+.Fl w
+options.
 .Pp
 If no files are specified, the standard input is used and no
 file name is displayed.
+.Sh ENVIRONMENT
+The
+.Ev LANG ,
+.Ev LC_ALL
+and
+.Ev LC_CTYPE
+environment variables affect the execution of
+.Nm
+as described in
+.Xr environ 7
+when the
+.Fl m
+option is specified.
+.Sh EXAMPLES
+Count the number of characters, words and lines in each of the files
+.Pa report1
+and
+.Pa report2
+as well as the totals for both:
+.Pp
+.Dl "wc -mlw report1 report2"
 .Sh DIAGNOSTICS
 .Ex -std
 .Sh SEE ALSO
@@ -108,7 +140,7 @@ function, as required by
 The
 .Nm
 function conforms to
-.St -p1003.2 .
+.St -p1003.1-2001 .
 .Sh HISTORY
 A
 .Nm
--- a/usr.bin/wc/wc.c
+++ b/usr.bin/wc/wc.c
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");

 #include <ctype.h>
 #include <err.h>
+#include <errno.h>
 #include <fcntl.h>
 #include <locale.h>
 #include <stdint.h>
@@ -60,7 +61,7 @@ __FBSDID("$FreeBSD$");
 #include <unistd.h>

 uintmax_t tlinect, twordct, tcharct;
-int doline, doword, dochar;
+int doline, doword, dochar, domulti;

 static int	cnt(const char *);
 static void	usage(void);
@@ -74,7 +75,7 @@ main(argc, argv)

 	(void) setlocale(LC_CTYPE, "");

-	while ((ch = getopt(argc, argv, "lwc")) != -1)
+	while ((ch = getopt(argc, argv, "clmw")) != -1)
 		switch((char)ch) {
 		case 'l':
 			doline = 1;
@@ -84,6 +85,11 @@ main(argc, argv)
 			break;
 		case 'c':
 			dochar = 1;
+			domulti = 0;
+			break;
+		case 'm':
+			domulti = 1;
+			dochar = 0;
 			break;
 		case '?':
 		default:
@@ -93,7 +99,7 @@ main(argc, argv)
 	argc -= optind;

 	/* Wc's flags are on by default. */
-	if (doline + doword + dochar == 0)
+	if (doline + doword + dochar + domulti == 0)
 		doline = doword = dochar = 1;

 	errors = 0;
@@ -117,7 +123,7 @@ main(argc, argv)
 			(void)printf(" %7ju", tlinect);
 		if (doword)
 			(void)printf(" %7ju", twordct);
-		if (dochar)
+		if (dochar || domulti)
 			(void)printf(" %7ju", tcharct);
 		(void)printf(" total\n");
 	}
@@ -130,10 +136,12 @@ cnt(file)
 {
 	struct stat sb;
 	uintmax_t linect, wordct, charct;
-	int fd, len;
+	ssize_t nread;
+	int clen, fd, len, warned;
 	short gotsp;
 	u_char *p;
 	u_char buf[MAXBSIZE], ch;
+	wchar_t wch;

 	linect = wordct = charct = 0;
 	if (file == NULL) {
@@ -144,7 +152,7 @@ cnt(file)
 			warn("%s: open", file);
 			return (1);
 		}
-		if (doword)
+		if (doword || (domulti && MB_CUR_MAX != 1))
 			goto word;
 		/*
 		 * Line counting is split out because it's a lot faster to get
@@ -176,7 +184,7 @@ cnt(file)
 		 * If all we need is the number of characters and it's a
 		 * regular or linked file, just stat the puppy.
 		 */
-		if (dochar) {
+		if (dochar || domulti) {
 			if (fstat(fd, &sb)) {
 				warn("%s: fstat", file);
 				(void)close(fd);
@@ -192,22 +200,41 @@ cnt(file)
 	}

 	/* Do it the hard way... */
-word:	for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) {
-		if (len == -1) {
+word:	gotsp = 1;
+	len = 0;
+	warned = 0;
+	while ((nread = read(fd, buf + len, MAXBSIZE - len)) != 0) {
+		if (nread == -1) {
 			warn("%s: read", file);
 			(void)close(fd);
 			return (1);
 		}
-		/*
-		 * This loses in the presence of multi-byte characters.
-		 * To do it right would require a function to return a
-		 * character while knowing how many bytes it consumed.
-		 */
-		charct += len;
-		for (p = buf; len--;) {
-			ch = *p++;
-			if (ch == '\n')
+		len += nread;
+		p = buf;
+		while (len > 0) {
+			if (!domulti || MB_CUR_MAX == 1) {
+				clen = 1;
+				wch = (unsigned char)*p;
+			} else if ((clen = mbtowc(&wch, p, len)) <= 0) {
+				if (len > MB_CUR_MAX) {
+					clen = 1;
+					wch = (unsigned char)*p;
+					if (!warned) {
+						errno = EILSEQ;
+						warn("%s", file);
+						warned = 1;
+					}
+				} else {
+					memmove(buf, p, len);
+					break;
+				}
+			}
+			charct++;
+			len -= clen;
+			p += clen;
+			if (wch == L'\n')
 				++linect;
+			/* XXX Non-portable; should use iswspace() */
 			if (isspace(ch))
 				gotsp = 1;
 			else if (gotsp) {
@@ -224,7 +251,7 @@ word:	for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) {
 		twordct += wordct;
 		(void)printf(" %7ju", wordct);
 	}
-	if (dochar) {
+	if (dochar || domulti) {
 		tcharct += charct;
 		(void)printf(" %7ju", charct);
 	}
@@ -235,6 +262,6 @@ word:	for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) {
 static void
 usage()
 {
-	(void)fprintf(stderr, "usage: wc [-clw] [file ...]\n");
+	(void)fprintf(stderr, "usage: wc [-clmw] [file ...]\n");
 	exit(1);
 }