Add support for multibyte characters. While here, fix a longstanding bug in
the implementation of the -d option: we were skipping too many characters
when a non-alphanumeric character was encountered.
This commit is contained in:
Tim J. Robbins 2004-07-19 11:12:02 +00:00
parent 479918321d
commit d67148e425
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=132394
2 changed files with 67 additions and 58 deletions

View File

@@ -106,12 +106,6 @@ A
utility appeared in utility appeared in
.At v7 . .At v7 .
.Sh BUGS .Sh BUGS
The
.Nm
utility does not recognize multibyte characters.
.Pp
Lines are not compared according to the current locale's collating
order.
Input files must be sorted with Input files must be sorted with
.Ev LC_COLLATE .Ev LC_COLLATE
set to set to

View File

@@ -60,7 +60,6 @@ __FBSDID("$FreeBSD$");
#include <sys/mman.h> #include <sys/mman.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <ctype.h>
#include <err.h> #include <err.h>
#include <errno.h> #include <errno.h>
#include <fcntl.h> #include <fcntl.h>
@@ -70,33 +69,25 @@ __FBSDID("$FreeBSD$");
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <unistd.h> #include <unistd.h>
#include <wchar.h>
#include <wctype.h>
#include "pathnames.h" #include "pathnames.h"
static char _path_words[] = _PATH_WORDS; static char _path_words[] = _PATH_WORDS;
/*
* FOLD and DICT convert characters to a normal form for comparison,
* according to the user specified flags.
*
* DICT expects integers because it uses a non-character value to
* indicate a character which should not participate in comparisons.
*/
#define EQUAL 0 #define EQUAL 0
#define GREATER 1 #define GREATER 1
#define LESS (-1) #define LESS (-1)
#define NO_COMPARE (-2)
#define FOLD(c) (isupper(c) ? tolower(c) : (unsigned char) (c))
#define DICT(c) (isalnum(c) ? (c) & 0xFF /* int */ : NO_COMPARE)
int dflag, fflag; int dflag, fflag;
char *binary_search(unsigned char *, unsigned char *, unsigned char *); char *binary_search(wchar_t *, unsigned char *, unsigned char *);
int compare(unsigned char *, unsigned char *, unsigned char *); int compare(wchar_t *, unsigned char *, unsigned char *);
char *linear_search(unsigned char *, unsigned char *, unsigned char *); char *linear_search(wchar_t *, unsigned char *, unsigned char *);
int look(unsigned char *, unsigned char *, unsigned char *); int look(wchar_t *, unsigned char *, unsigned char *);
void print_from(unsigned char *, unsigned char *, unsigned char *); wchar_t *prepkey(const char *, wchar_t);
void print_from(wchar_t *, unsigned char *, unsigned char *);
static void usage(void); static void usage(void);
@@ -104,14 +95,16 @@ int
main(int argc, char *argv[]) main(int argc, char *argv[])
{ {
struct stat sb; struct stat sb;
int ch, fd, termchar, match; int ch, fd, match;
unsigned char *back, *front, *string, *p; wchar_t termchar;
unsigned char *back, *front;
unsigned const char *file; unsigned const char *file;
wchar_t *key;
(void) setlocale(LC_CTYPE, ""); (void) setlocale(LC_CTYPE, "");
file = _path_words; file = _path_words;
termchar = '\0'; termchar = L'\0';
while ((ch = getopt(argc, argv, "dft:")) != -1) while ((ch = getopt(argc, argv, "dft:")) != -1)
switch(ch) { switch(ch) {
case 'd': case 'd':
@@ -121,7 +114,9 @@ main(int argc, char *argv[])
fflag = 1; fflag = 1;
break; break;
case 't': case 't':
termchar = *optarg; if (mbrtowc(&termchar, optarg, MB_LEN_MAX, NULL) !=
strlen(optarg))
errx(2, "invalid termination character");
break; break;
case '?': case '?':
default: default:
@@ -134,12 +129,10 @@ main(int argc, char *argv[])
usage(); usage();
if (argc == 1) /* But set -df by default. */ if (argc == 1) /* But set -df by default. */
dflag = fflag = 1; dflag = fflag = 1;
string = *argv++; key = prepkey(*argv++, termchar);
if (argc >= 2) if (argc >= 2)
file = *argv++; file = *argv++;
if (termchar != '\0' && (p = strchr(string, termchar)) != NULL)
*++p = '\0';
match = 1; match = 1;
do { do {
@@ -150,29 +143,47 @@ main(int argc, char *argv[])
if ((front = mmap(NULL, (size_t)sb.st_size, PROT_READ, MAP_SHARED, fd, (off_t)0)) == MAP_FAILED) if ((front = mmap(NULL, (size_t)sb.st_size, PROT_READ, MAP_SHARED, fd, (off_t)0)) == MAP_FAILED)
err(2, "%s", file); err(2, "%s", file);
back = front + sb.st_size; back = front + sb.st_size;
match *= (look(string, front, back)); match *= (look(key, front, back));
close(fd); close(fd);
} while (argc-- > 2 && (file = *argv++)); } while (argc-- > 2 && (file = *argv++));
exit(match); exit(match);
} }
int wchar_t *
look(unsigned char *string, unsigned char *front, unsigned char *back) prepkey(const char *string, wchar_t termchar)
{ {
int ch; const char *readp;
unsigned char *readp, *writep; wchar_t *key, *writep;
wchar_t ch;
size_t clen;
/* Reformat string string to avoid doing it multiple times later. */ /*
for (readp = writep = string; (ch = *readp++);) { * Reformat search string and convert to wide character representation
* to avoid doing it multiple times later.
*/
if ((key = malloc(sizeof(wchar_t) * (strlen(string) + 1))) == NULL)
err(2, NULL);
readp = string;
writep = key;
while ((clen = mbrtowc(&ch, readp, MB_LEN_MAX, NULL)) != 0) {
if (clen == (size_t)-1 || clen == (size_t)-2)
errc(2, EILSEQ, NULL);
if (fflag) if (fflag)
ch = FOLD(ch); ch = towlower(ch);
if (dflag) if (!dflag || iswalnum(ch))
ch = DICT(ch); *writep++ = ch;
if (ch != NO_COMPARE) readp += clen;
*(writep++) = ch;
} }
*writep = '\0'; *writep = L'\0';
if (termchar != L'\0' && (writep = wcschr(key, termchar)) != NULL)
*++writep = L'\0';
return (key);
}
int
look(wchar_t *string, unsigned char *front, unsigned char *back)
{
front = binary_search(string, front, back); front = binary_search(string, front, back);
front = linear_search(string, front, back); front = linear_search(string, front, back);
@@ -225,7 +236,7 @@ look(unsigned char *string, unsigned char *front, unsigned char *back)
while (p < back && *p++ != '\n'); while (p < back && *p++ != '\n');
char * char *
binary_search(unsigned char *string, unsigned char *front, unsigned char *back) binary_search(wchar_t *string, unsigned char *front, unsigned char *back)
{ {
unsigned char *p; unsigned char *p;
@@ -259,7 +270,7 @@ binary_search(unsigned char *string, unsigned char *front, unsigned char *back)
* o front is before or at the first line to be printed. * o front is before or at the first line to be printed.
*/ */
char * char *
linear_search(unsigned char *string, unsigned char *front, unsigned char *back) linear_search(wchar_t *string, unsigned char *front, unsigned char *back)
{ {
while (front < back) { while (front < back) {
switch (compare(string, front, back)) { switch (compare(string, front, back)) {
@@ -281,7 +292,7 @@ linear_search(unsigned char *string, unsigned char *front, unsigned char *back)
* Print as many lines as match string, starting at front. * Print as many lines as match string, starting at front.
*/ */
void void
print_from(unsigned char *string, unsigned char *front, unsigned char *back) print_from(wchar_t *string, unsigned char *front, unsigned char *back)
{ {
for (; front < back && compare(string, front, back) == EQUAL; ++front) { for (; front < back && compare(string, front, back) == EQUAL; ++front) {
for (; front < back && *front != '\n'; ++front) for (; front < back && *front != '\n'; ++front)
@@ -306,23 +317,27 @@ print_from(unsigned char *string, unsigned char *front, unsigned char *back)
* "back" terminated). * "back" terminated).
*/ */
int int
compare(unsigned char *s1, unsigned char *s2, unsigned char *back) compare(wchar_t *s1, unsigned char *s2, unsigned char *back)
{ {
int ch; wchar_t ch1, ch2;
size_t len2;
for (; *s1 && s2 < back && *s2 != '\n'; ++s1, ++s2) { for (; *s1 && s2 < back && *s2 != '\n'; ++s1, s2 += len2) {
ch = *s2; ch1 = *s1;
len2 = mbrtowc(&ch2, s2, back - s2, NULL);
if (len2 == (size_t)-1 || len2 == (size_t)-2) {
ch2 = *s2;
len2 = 1;
}
if (fflag) if (fflag)
ch = FOLD(ch); ch2 = towlower(ch2);
if (dflag) if (dflag && !iswalnum(ch2)) {
ch = DICT(ch); /* Ignore character in comparison. */
--s1;
if (ch == NO_COMPARE) {
++s2; /* Ignore character in comparison. */
continue; continue;
} }
if (*s1 != ch) if (ch1 != ch2)
return (*s1 < ch ? LESS : GREATER); return (ch1 < ch2 ? LESS : GREATER);
} }
return (*s1 ? GREATER : EQUAL); return (*s1 ? GREATER : EQUAL);
} }