1) Following r302512 (remove collation support for [a-z]-ranges in libc)

remove collation support for a-z ranges here too.
It was implemented for single byte locales only in any case.

2) Reduce [Cc]flag loop to WCHAR_MAX; WINT_MAX here includes WEOF, which is
not a character.

3) Optimize [Cc]flag case: don't repeatedly add the last character of
string2 to squeeze cset when string2 reaches its EOS state.

4) Reflect in the manpage that [=equiv=] is implemented for single
byte locales only.
This commit is contained in:
Andrey A. Chernov 2016-07-11 21:23:50 +00:00
parent 9f2b8101d7
commit 1ef4039ac7
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=302594
3 changed files with 20 additions and 92 deletions

View File

@ -53,7 +53,7 @@ static int backslash(STR *, int *);
static int bracket(STR *);
static void genclass(STR *);
static void genequiv(STR *);
static int genrange(STR *, int);
static int genrange(STR *);
static void genseq(STR *);
wint_t
@ -93,7 +93,7 @@ next(STR *s)
}
/* We can start a range at any time. */
if (s->str[0] == '-' && genrange(s, is_octal))
if (s->str[0] == '-' && genrange(s))
return (next(s));
return (1);
case RANGE:
@ -237,18 +237,16 @@ genequiv(STR *s)
}
static int
genrange(STR *s, int was_octal)
genrange(STR *s)
{
int stopval, octal;
int stopval;
char *savestart;
int n, cnt, *p;
size_t clen;
wchar_t wc;
octal = 0;
savestart = s->str;
if (*++s->str == '\\')
stopval = backslash(s, &octal);
stopval = backslash(s, NULL);
else {
clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
if (clen == (size_t)-1 || clen == (size_t)-2)
@ -256,37 +254,13 @@ genrange(STR *s, int was_octal)
stopval = wc;
s->str += clen;
}
/*
* XXX Characters are not ordered according to collating sequence in
* multibyte locales.
*/
if (octal || was_octal || MB_CUR_MAX > 1) {
if (stopval < s->lastch) {
s->str = savestart;
return (0);
}
s->cnt = stopval - s->lastch + 1;
s->state = RANGE;
--s->lastch;
return (1);
}
if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
if (stopval < s->lastch) {
s->str = savestart;
return (0);
}
if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
err(1, "genrange() malloc");
for (cnt = 0; cnt < NCHARS_SB; cnt++)
if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
*p++ = cnt;
*p = OOBCH;
n = p - s->set;
s->cnt = 0;
s->state = SET;
if (n > 1)
mergesort(s->set, n, sizeof(*(s->set)), charcoll);
s->cnt = stopval - s->lastch + 1;
s->state = RANGE;
--s->lastch;
return (1);
}

View File

@ -164,14 +164,6 @@ as defined by the collation sequence.
If either or both of the range endpoints are octal sequences, it
represents the range of specific coded values between the
range endpoints, inclusive.
.Pp
.Bf Em
See the
.Sx COMPATIBILITY
section below for an important note regarding
differences in the way the current
implementation interprets range expressions differently from
previous implementations.
.Ef
.It [:class:]
Represents all characters belonging to the defined character class.
@ -307,22 +299,16 @@ Remove diacritical marks from all accented variants of the letter
.Pp
.Dl "tr \*q[=e=]\*q \*qe\*q"
.Sh COMPATIBILITY
Previous
.Fx
implementations of
.Nm
did not order characters in range expressions according to the current
locale's collation order, making it possible to convert unaccented Latin
characters (esp.\& as found in English text) from upper to lower case using
locale's collation order, making it possible to convert accented Latin
characters from upper to lower case using
the traditional
.Ux
idiom of
.Dq Li "tr A-Z a-z" .
Since
.Nm
now obeys the locale's collation order, this idiom may not produce
correct results when there is not a 1:1 mapping between lower and
upper case, or when the order of characters within the two cases differs.
As noted in the
.Sx EXAMPLES
section above, the character class expressions
@ -334,6 +320,9 @@ should be used instead of explicit character ranges like
and
.Dq Li A-Z .
.Pp
.Dq Li [=equiv=]
expression is implemented for single byte locales only.
.Pp
System V has historically implemented character ranges using the syntax
.Dq Li [c-c]
instead of the

View File

@ -68,10 +68,8 @@ static void usage(void);
int
main(int argc, char **argv)
{
static int carray[NCHARS_SB];
struct cmap *map;
struct cset *delete, *squeeze;
int n, *p;
int Cflag, cflag, dflag, sflag, isstring2;
wint_t ch, cnt, lastch;
@ -254,7 +252,7 @@ main(int argc, char **argv)
(void)next(&s2);
}
endloop:
if (cflag || (Cflag && MB_CUR_MAX > 1)) {
if (cflag || Cflag) {
/*
* This is somewhat tricky: since the character set is
* potentially huge, we need to avoid allocating a map
@ -268,14 +266,15 @@ main(int argc, char **argv)
*/
s2.str = argv[1];
s2.state = NORMAL;
for (cnt = 0; cnt < WINT_MAX; cnt++) {
for (cnt = 0; cnt <= WCHAR_MAX; cnt++) {
if (Cflag && !iswrune(cnt))
continue;
if (cmap_lookup(map, cnt) == OOBCH) {
if (next(&s2))
if (next(&s2)) {
cmap_add(map, cnt, s2.lastch);
if (sflag)
cset_add(squeeze, s2.lastch);
if (sflag)
cset_add(squeeze, s2.lastch);
}
} else
cmap_add(map, cnt, cnt);
if ((s2.state == EOS || s2.state == INFINITE) &&
@ -283,30 +282,6 @@ main(int argc, char **argv)
break;
}
cmap_default(map, s2.lastch);
} else if (Cflag) {
for (p = carray, cnt = 0; cnt < NCHARS_SB; cnt++) {
if (cmap_lookup(map, cnt) == OOBCH && iswrune(cnt))
*p++ = cnt;
else
cmap_add(map, cnt, cnt);
}
n = p - carray;
if (Cflag && n > 1)
(void)mergesort(carray, n, sizeof(*carray), charcoll);
s2.str = argv[1];
s2.state = NORMAL;
for (cnt = 0; cnt < n; cnt++) {
(void)next(&s2);
cmap_add(map, carray[cnt], s2.lastch);
/*
* Chars taken from s2 can be different this time
* due to lack of complex upper/lower processing,
* so fill string2 again to not miss some.
*/
if (sflag)
cset_add(squeeze, s2.lastch);
}
}
cset_cache(squeeze);
@ -351,16 +326,6 @@ setup(char *arg, STR *str, int cflag, int Cflag)
return (cs);
}
int
charcoll(const void *a, const void *b)
{
static char sa[2], sb[2];
sa[0] = *(const int *)a;
sb[0] = *(const int *)b;
return (strcoll(sa, sb));
}
static void
usage(void)
{