Back out non-collating [a-z] ranges (r302594).
Instead of changing the whole course to another POSIX-permitted way for consistency and uniformity, I have decided to completely ignore the missing regex functionality and focus on fixing bugs in what we have now; there are too many small obstacles to choosing another way, counting ports. Corresponding libc changes are backed out in r302824.
This commit is contained in:
parent
12eae8c8f3
commit
5b4fa425ba
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/head/; revision=302825
@ -53,7 +53,7 @@ static int backslash(STR *, int *);
|
||||
static int bracket(STR *);
|
||||
static void genclass(STR *);
|
||||
static void genequiv(STR *);
|
||||
static int genrange(STR *);
|
||||
static int genrange(STR *, int);
|
||||
static void genseq(STR *);
|
||||
|
||||
wint_t
|
||||
@ -93,7 +93,7 @@ next(STR *s)
|
||||
}
|
||||
|
||||
/* We can start a range at any time. */
|
||||
if (s->str[0] == '-' && genrange(s))
|
||||
if (s->str[0] == '-' && genrange(s, is_octal))
|
||||
return (next(s));
|
||||
return (1);
|
||||
case RANGE:
|
||||
@ -237,16 +237,18 @@ genequiv(STR *s)
|
||||
}
|
||||
|
||||
static int
|
||||
genrange(STR *s)
|
||||
genrange(STR *s, int was_octal)
|
||||
{
|
||||
int stopval;
|
||||
int stopval, octal;
|
||||
char *savestart;
|
||||
int n, cnt, *p;
|
||||
size_t clen;
|
||||
wchar_t wc;
|
||||
|
||||
octal = 0;
|
||||
savestart = s->str;
|
||||
if (*++s->str == '\\')
|
||||
stopval = backslash(s, NULL);
|
||||
stopval = backslash(s, &octal);
|
||||
else {
|
||||
clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
|
||||
if (clen == (size_t)-1 || clen == (size_t)-2)
|
||||
@ -254,13 +256,37 @@ genrange(STR *s)
|
||||
stopval = wc;
|
||||
s->str += clen;
|
||||
}
|
||||
if (stopval < s->lastch) {
|
||||
/*
|
||||
* XXX Characters are not ordered according to collating sequence in
|
||||
* multibyte locales.
|
||||
*/
|
||||
if (octal || was_octal || MB_CUR_MAX > 1) {
|
||||
if (stopval < s->lastch) {
|
||||
s->str = savestart;
|
||||
return (0);
|
||||
}
|
||||
s->cnt = stopval - s->lastch + 1;
|
||||
s->state = RANGE;
|
||||
--s->lastch;
|
||||
return (1);
|
||||
}
|
||||
if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
|
||||
s->str = savestart;
|
||||
return (0);
|
||||
}
|
||||
s->cnt = stopval - s->lastch + 1;
|
||||
s->state = RANGE;
|
||||
--s->lastch;
|
||||
if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
|
||||
err(1, "genrange() malloc");
|
||||
for (cnt = 0; cnt < NCHARS_SB; cnt++)
|
||||
if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
|
||||
charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
|
||||
*p++ = cnt;
|
||||
*p = OOBCH;
|
||||
n = p - s->set;
|
||||
|
||||
s->cnt = 0;
|
||||
s->state = SET;
|
||||
if (n > 1)
|
||||
mergesort(s->set, n, sizeof(*(s->set)), charcoll);
|
||||
return (1);
|
||||
}
|
||||
|
||||
|
@ -164,6 +164,14 @@ as defined by the collation sequence.
|
||||
If either or both of the range endpoints are octal sequences, it
|
||||
represents the range of specific coded values between the
|
||||
range endpoints, inclusive.
|
||||
.Pp
|
||||
.Bf Em
|
||||
See the
|
||||
.Sx COMPATIBILITY
|
||||
section below for an important note regarding
|
||||
differences in the way the current
|
||||
implementation interprets range expressions differently from
|
||||
previous implementations.
|
||||
.Ef
|
||||
.It [:class:]
|
||||
Represents all characters belonging to the defined character class.
|
||||
@ -299,16 +307,22 @@ Remove diacritical marks from all accented variants of the letter
|
||||
.Pp
|
||||
.Dl "tr \*q[=e=]\*q \*qe\*q"
|
||||
.Sh COMPATIBILITY
|
||||
Previous
|
||||
.Fx
|
||||
implementations of
|
||||
.Nm
|
||||
did not order characters in range expressions according to the current
|
||||
locale's collation order, making it possible to convert accented Latin
|
||||
characters from upper to lower case using
|
||||
locale's collation order, making it possible to convert unaccented Latin
|
||||
characters (esp.\& as found in English text) from upper to lower case using
|
||||
the traditional
|
||||
.Ux
|
||||
idiom of
|
||||
.Dq Li "tr A-Z a-z" .
|
||||
Since
|
||||
.Nm
|
||||
now obeys the locale's collation order, this idiom may not produce
|
||||
correct results when there is not a 1:1 mapping between lower and
|
||||
upper case, or when the order of characters within the two cases differs.
|
||||
As noted in the
|
||||
.Sx EXAMPLES
|
||||
section above, the character class expressions
|
||||
@ -320,9 +334,6 @@ should be used instead of explicit character ranges like
|
||||
and
|
||||
.Dq Li A-Z .
|
||||
.Pp
|
||||
.Dq Li [=equiv=]
|
||||
expression is implemented for single byte locales only.
|
||||
.Pp
|
||||
System V has historically implemented character ranges using the syntax
|
||||
.Dq Li [c-c]
|
||||
instead of the
|
||||
|
@ -68,8 +68,10 @@ static void usage(void);
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
static int carray[NCHARS_SB];
|
||||
struct cmap *map;
|
||||
struct cset *delete, *squeeze;
|
||||
int n, *p;
|
||||
int Cflag, cflag, dflag, sflag, isstring2;
|
||||
wint_t ch, cnt, lastch;
|
||||
|
||||
@ -252,7 +254,7 @@ main(int argc, char **argv)
|
||||
(void)next(&s2);
|
||||
}
|
||||
endloop:
|
||||
if (cflag || Cflag) {
|
||||
if (cflag || (Cflag && MB_CUR_MAX > 1)) {
|
||||
/*
|
||||
* This is somewhat tricky: since the character set is
|
||||
* potentially huge, we need to avoid allocating a map
|
||||
@ -270,11 +272,10 @@ main(int argc, char **argv)
|
||||
if (Cflag && !iswrune(cnt))
|
||||
continue;
|
||||
if (cmap_lookup(map, cnt) == OOBCH) {
|
||||
if (next(&s2)) {
|
||||
if (next(&s2))
|
||||
cmap_add(map, cnt, s2.lastch);
|
||||
if (sflag)
|
||||
cset_add(squeeze, s2.lastch);
|
||||
}
|
||||
if (sflag)
|
||||
cset_add(squeeze, s2.lastch);
|
||||
} else
|
||||
cmap_add(map, cnt, cnt);
|
||||
if ((s2.state == EOS || s2.state == INFINITE) &&
|
||||
@ -282,6 +283,30 @@ main(int argc, char **argv)
|
||||
break;
|
||||
}
|
||||
cmap_default(map, s2.lastch);
|
||||
} else if (Cflag) {
|
||||
for (p = carray, cnt = 0; cnt < NCHARS_SB; cnt++) {
|
||||
if (cmap_lookup(map, cnt) == OOBCH && iswrune(cnt))
|
||||
*p++ = cnt;
|
||||
else
|
||||
cmap_add(map, cnt, cnt);
|
||||
}
|
||||
n = p - carray;
|
||||
if (Cflag && n > 1)
|
||||
(void)mergesort(carray, n, sizeof(*carray), charcoll);
|
||||
|
||||
s2.str = argv[1];
|
||||
s2.state = NORMAL;
|
||||
for (cnt = 0; cnt < n; cnt++) {
|
||||
(void)next(&s2);
|
||||
cmap_add(map, carray[cnt], s2.lastch);
|
||||
/*
|
||||
* Chars taken from s2 can be different this time
|
||||
* due to lack of complex upper/lower processing,
|
||||
* so fill string2 again to not miss some.
|
||||
*/
|
||||
if (sflag)
|
||||
cset_add(squeeze, s2.lastch);
|
||||
}
|
||||
}
|
||||
|
||||
cset_cache(squeeze);
|
||||
@ -326,6 +351,16 @@ setup(char *arg, STR *str, int cflag, int Cflag)
|
||||
return (cs);
|
||||
}
|
||||
|
||||
int
|
||||
charcoll(const void *a, const void *b)
|
||||
{
|
||||
static char sa[2], sb[2];
|
||||
|
||||
sa[0] = *(const int *)a;
|
||||
sb[0] = *(const int *)b;
|
||||
return (strcoll(sa, sb));
|
||||
}
|
||||
|
||||
static void
|
||||
usage(void)
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user