Back out non-collating [a-z] ranges (r302594).

Instead of changing the whole course to another POSIX-permitted way
for consistency and uniformity, I have decided to completely ignore the missing
regex functionality and focus on fixing bugs in what we have now;
there are too many small obstacles to choosing the other way, counting ports.
Corresponding libc changes are backed out in r302824.
This commit is contained in:
Andrey A. Chernov 2016-07-14 09:19:53 +00:00
parent 12eae8c8f3
commit 5b4fa425ba
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=302825
3 changed files with 91 additions and 19 deletions

View File

@ -53,7 +53,7 @@ static int backslash(STR *, int *);
static int bracket(STR *);
static void genclass(STR *);
static void genequiv(STR *);
static int genrange(STR *);
static int genrange(STR *, int);
static void genseq(STR *);
wint_t
@ -93,7 +93,7 @@ next(STR *s)
}
/* We can start a range at any time. */
if (s->str[0] == '-' && genrange(s))
if (s->str[0] == '-' && genrange(s, is_octal))
return (next(s));
return (1);
case RANGE:
@ -237,16 +237,18 @@ genequiv(STR *s)
}
static int
genrange(STR *s)
genrange(STR *s, int was_octal)
{
int stopval;
int stopval, octal;
char *savestart;
int n, cnt, *p;
size_t clen;
wchar_t wc;
octal = 0;
savestart = s->str;
if (*++s->str == '\\')
stopval = backslash(s, NULL);
stopval = backslash(s, &octal);
else {
clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
if (clen == (size_t)-1 || clen == (size_t)-2)
@ -254,13 +256,37 @@ genrange(STR *s)
stopval = wc;
s->str += clen;
}
if (stopval < s->lastch) {
/*
* XXX Characters are not ordered according to collating sequence in
* multibyte locales.
*/
if (octal || was_octal || MB_CUR_MAX > 1) {
if (stopval < s->lastch) {
s->str = savestart;
return (0);
}
s->cnt = stopval - s->lastch + 1;
s->state = RANGE;
--s->lastch;
return (1);
}
if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
s->str = savestart;
return (0);
}
s->cnt = stopval - s->lastch + 1;
s->state = RANGE;
--s->lastch;
if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
err(1, "genrange() malloc");
for (cnt = 0; cnt < NCHARS_SB; cnt++)
if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
*p++ = cnt;
*p = OOBCH;
n = p - s->set;
s->cnt = 0;
s->state = SET;
if (n > 1)
mergesort(s->set, n, sizeof(*(s->set)), charcoll);
return (1);
}

View File

@ -164,6 +164,14 @@ as defined by the collation sequence.
If either or both of the range endpoints are octal sequences, it
represents the range of specific coded values between the
range endpoints, inclusive.
.Pp
.Bf Em
See the
.Sx COMPATIBILITY
section below for an important note regarding
differences between the way the current
implementation interprets range expressions and the way
previous implementations did.
.Ef
.It [:class:]
Represents all characters belonging to the defined character class.
@ -299,16 +307,22 @@ Remove diacritical marks from all accented variants of the letter
.Pp
.Dl "tr \*q[=e=]\*q \*qe\*q"
.Sh COMPATIBILITY
Previous
.Fx
implementations of
.Nm
did not order characters in range expressions according to the current
locale's collation order, making it possible to convert accented Latin
characters from upper to lower case using
locale's collation order, making it possible to convert unaccented Latin
characters (esp.\& as found in English text) from upper to lower case using
the traditional
.Ux
idiom of
.Dq Li "tr A-Z a-z" .
Since
.Nm
now obeys the locale's collation order, this idiom may not produce
correct results when there is not a 1:1 mapping between lower and
upper case, or when the order of characters within the two cases differs.
As noted in the
.Sx EXAMPLES
section above, the character class expressions
@ -320,9 +334,6 @@ should be used instead of explicit character ranges like
and
.Dq Li A-Z .
.Pp
.Dq Li [=equiv=]
expression is implemented for single byte locales only.
.Pp
System V has historically implemented character ranges using the syntax
.Dq Li [c-c]
instead of the

View File

@ -68,8 +68,10 @@ static void usage(void);
int
main(int argc, char **argv)
{
static int carray[NCHARS_SB];
struct cmap *map;
struct cset *delete, *squeeze;
int n, *p;
int Cflag, cflag, dflag, sflag, isstring2;
wint_t ch, cnt, lastch;
@ -252,7 +254,7 @@ main(int argc, char **argv)
(void)next(&s2);
}
endloop:
if (cflag || Cflag) {
if (cflag || (Cflag && MB_CUR_MAX > 1)) {
/*
* This is somewhat tricky: since the character set is
* potentially huge, we need to avoid allocating a map
@ -270,11 +272,10 @@ main(int argc, char **argv)
if (Cflag && !iswrune(cnt))
continue;
if (cmap_lookup(map, cnt) == OOBCH) {
if (next(&s2)) {
if (next(&s2))
cmap_add(map, cnt, s2.lastch);
if (sflag)
cset_add(squeeze, s2.lastch);
}
if (sflag)
cset_add(squeeze, s2.lastch);
} else
cmap_add(map, cnt, cnt);
if ((s2.state == EOS || s2.state == INFINITE) &&
@ -282,6 +283,30 @@ main(int argc, char **argv)
break;
}
cmap_default(map, s2.lastch);
} else if (Cflag) {
for (p = carray, cnt = 0; cnt < NCHARS_SB; cnt++) {
if (cmap_lookup(map, cnt) == OOBCH && iswrune(cnt))
*p++ = cnt;
else
cmap_add(map, cnt, cnt);
}
n = p - carray;
if (Cflag && n > 1)
(void)mergesort(carray, n, sizeof(*carray), charcoll);
s2.str = argv[1];
s2.state = NORMAL;
for (cnt = 0; cnt < n; cnt++) {
(void)next(&s2);
cmap_add(map, carray[cnt], s2.lastch);
/*
* Chars taken from s2 can be different this time
* due to lack of complex upper/lower processing,
* so fill string2 again to not miss some.
*/
if (sflag)
cset_add(squeeze, s2.lastch);
}
}
cset_cache(squeeze);
@ -326,6 +351,16 @@ setup(char *arg, STR *str, int cflag, int Cflag)
return (cs);
}
int
charcoll(const void *a, const void *b)
{
	/*
	 * Comparison callback in the qsort()/mergesort() style: a and b each
	 * point to an int holding a character code.  Each code is converted
	 * to char and placed in a one-character, NUL-terminated string so the
	 * pair can be compared with strcoll().
	 *
	 * Returns the strcoll() result: negative, zero or positive when the
	 * first character collates before, equal to, or after the second.
	 */
	const int lhs_code = *(const int *)a;
	const int rhs_code = *(const int *)b;
	char lhs[2] = { (char)lhs_code, '\0' };
	char rhs[2] = { (char)rhs_code, '\0' };

	return (strcoll(lhs, rhs));
}
static void
usage(void)
{