1) Following r302512 (remove collation support for [a-z]-ranges in libc)
remove collation support for a-z ranges here too. It was implemented for single-byte locales only in any case. 2) Reduce the [Cc]flag loop to WCHAR_MAX; WINT_MAX here includes WEOF, which is not a character. 3) Optimize the [Cc]flag case: don't repeatedly add the last character of string2 to the squeeze cset once string2 reaches its EOS state. 4) Reflect in the manpage that [=equiv=] is implemented for single-byte locales only.
This commit is contained in:
parent
3c9e90ee71
commit
6fb623e6e6
@ -53,7 +53,7 @@ static int backslash(STR *, int *);
|
||||
static int bracket(STR *);
|
||||
static void genclass(STR *);
|
||||
static void genequiv(STR *);
|
||||
static int genrange(STR *, int);
|
||||
static int genrange(STR *);
|
||||
static void genseq(STR *);
|
||||
|
||||
wint_t
|
||||
@ -93,7 +93,7 @@ next(STR *s)
|
||||
}
|
||||
|
||||
/* We can start a range at any time. */
|
||||
if (s->str[0] == '-' && genrange(s, is_octal))
|
||||
if (s->str[0] == '-' && genrange(s))
|
||||
return (next(s));
|
||||
return (1);
|
||||
case RANGE:
|
||||
@ -237,18 +237,16 @@ genequiv(STR *s)
|
||||
}
|
||||
|
||||
static int
|
||||
genrange(STR *s, int was_octal)
|
||||
genrange(STR *s)
|
||||
{
|
||||
int stopval, octal;
|
||||
int stopval;
|
||||
char *savestart;
|
||||
int n, cnt, *p;
|
||||
size_t clen;
|
||||
wchar_t wc;
|
||||
|
||||
octal = 0;
|
||||
savestart = s->str;
|
||||
if (*++s->str == '\\')
|
||||
stopval = backslash(s, &octal);
|
||||
stopval = backslash(s, NULL);
|
||||
else {
|
||||
clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
|
||||
if (clen == (size_t)-1 || clen == (size_t)-2)
|
||||
@ -256,37 +254,13 @@ genrange(STR *s, int was_octal)
|
||||
stopval = wc;
|
||||
s->str += clen;
|
||||
}
|
||||
/*
|
||||
* XXX Characters are not ordered according to collating sequence in
|
||||
* multibyte locales.
|
||||
*/
|
||||
if (octal || was_octal || MB_CUR_MAX > 1) {
|
||||
if (stopval < s->lastch) {
|
||||
s->str = savestart;
|
||||
return (0);
|
||||
}
|
||||
s->cnt = stopval - s->lastch + 1;
|
||||
s->state = RANGE;
|
||||
--s->lastch;
|
||||
return (1);
|
||||
}
|
||||
if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
|
||||
if (stopval < s->lastch) {
|
||||
s->str = savestart;
|
||||
return (0);
|
||||
}
|
||||
if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
|
||||
err(1, "genrange() malloc");
|
||||
for (cnt = 0; cnt < NCHARS_SB; cnt++)
|
||||
if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
|
||||
charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
|
||||
*p++ = cnt;
|
||||
*p = OOBCH;
|
||||
n = p - s->set;
|
||||
|
||||
s->cnt = 0;
|
||||
s->state = SET;
|
||||
if (n > 1)
|
||||
mergesort(s->set, n, sizeof(*(s->set)), charcoll);
|
||||
s->cnt = stopval - s->lastch + 1;
|
||||
s->state = RANGE;
|
||||
--s->lastch;
|
||||
return (1);
|
||||
}
|
||||
|
||||
|
@ -164,14 +164,6 @@ as defined by the collation sequence.
|
||||
If either or both of the range endpoints are octal sequences, it
|
||||
represents the range of specific coded values between the
|
||||
range endpoints, inclusive.
|
||||
.Pp
|
||||
.Bf Em
|
||||
See the
|
||||
.Sx COMPATIBILITY
|
||||
section below for an important note regarding
|
||||
differences in the way the current
|
||||
implementation interprets range expressions differently from
|
||||
previous implementations.
|
||||
.Ef
|
||||
.It [:class:]
|
||||
Represents all characters belonging to the defined character class.
|
||||
@ -307,22 +299,16 @@ Remove diacritical marks from all accented variants of the letter
|
||||
.Pp
|
||||
.Dl "tr \*q[=e=]\*q \*qe\*q"
|
||||
.Sh COMPATIBILITY
|
||||
Previous
|
||||
.Fx
|
||||
implementations of
|
||||
.Nm
|
||||
did not order characters in range expressions according to the current
|
||||
locale's collation order, making it possible to convert unaccented Latin
|
||||
characters (esp.\& as found in English text) from upper to lower case using
|
||||
locale's collation order, making it possible to convert accented Latin
|
||||
characters from upper to lower case using
|
||||
the traditional
|
||||
.Ux
|
||||
idiom of
|
||||
.Dq Li "tr A-Z a-z" .
|
||||
Since
|
||||
.Nm
|
||||
now obeys the locale's collation order, this idiom may not produce
|
||||
correct results when there is not a 1:1 mapping between lower and
|
||||
upper case, or when the order of characters within the two cases differs.
|
||||
As noted in the
|
||||
.Sx EXAMPLES
|
||||
section above, the character class expressions
|
||||
@ -334,6 +320,9 @@ should be used instead of explicit character ranges like
|
||||
and
|
||||
.Dq Li A-Z .
|
||||
.Pp
|
||||
.Dq Li [=equiv=]
|
||||
expression is implemented for single byte locales only.
|
||||
.Pp
|
||||
System V has historically implemented character ranges using the syntax
|
||||
.Dq Li [c-c]
|
||||
instead of the
|
||||
|
@ -68,10 +68,8 @@ static void usage(void);
|
||||
int
|
||||
main(int argc, char **argv)
|
||||
{
|
||||
static int carray[NCHARS_SB];
|
||||
struct cmap *map;
|
||||
struct cset *delete, *squeeze;
|
||||
int n, *p;
|
||||
int Cflag, cflag, dflag, sflag, isstring2;
|
||||
wint_t ch, cnt, lastch;
|
||||
|
||||
@ -254,7 +252,7 @@ main(int argc, char **argv)
|
||||
(void)next(&s2);
|
||||
}
|
||||
endloop:
|
||||
if (cflag || (Cflag && MB_CUR_MAX > 1)) {
|
||||
if (cflag || Cflag) {
|
||||
/*
|
||||
* This is somewhat tricky: since the character set is
|
||||
* potentially huge, we need to avoid allocating a map
|
||||
@ -268,14 +266,15 @@ endloop:
|
||||
*/
|
||||
s2.str = argv[1];
|
||||
s2.state = NORMAL;
|
||||
for (cnt = 0; cnt < WINT_MAX; cnt++) {
|
||||
for (cnt = 0; cnt <= WCHAR_MAX; cnt++) {
|
||||
if (Cflag && !iswrune(cnt))
|
||||
continue;
|
||||
if (cmap_lookup(map, cnt) == OOBCH) {
|
||||
if (next(&s2))
|
||||
if (next(&s2)) {
|
||||
cmap_add(map, cnt, s2.lastch);
|
||||
if (sflag)
|
||||
cset_add(squeeze, s2.lastch);
|
||||
if (sflag)
|
||||
cset_add(squeeze, s2.lastch);
|
||||
}
|
||||
} else
|
||||
cmap_add(map, cnt, cnt);
|
||||
if ((s2.state == EOS || s2.state == INFINITE) &&
|
||||
@ -283,30 +282,6 @@ endloop:
|
||||
break;
|
||||
}
|
||||
cmap_default(map, s2.lastch);
|
||||
} else if (Cflag) {
|
||||
for (p = carray, cnt = 0; cnt < NCHARS_SB; cnt++) {
|
||||
if (cmap_lookup(map, cnt) == OOBCH && iswrune(cnt))
|
||||
*p++ = cnt;
|
||||
else
|
||||
cmap_add(map, cnt, cnt);
|
||||
}
|
||||
n = p - carray;
|
||||
if (Cflag && n > 1)
|
||||
(void)mergesort(carray, n, sizeof(*carray), charcoll);
|
||||
|
||||
s2.str = argv[1];
|
||||
s2.state = NORMAL;
|
||||
for (cnt = 0; cnt < n; cnt++) {
|
||||
(void)next(&s2);
|
||||
cmap_add(map, carray[cnt], s2.lastch);
|
||||
/*
|
||||
* Chars taken from s2 can be different this time
|
||||
* due to lack of complex upper/lower processing,
|
||||
* so fill string2 again to not miss some.
|
||||
*/
|
||||
if (sflag)
|
||||
cset_add(squeeze, s2.lastch);
|
||||
}
|
||||
}
|
||||
|
||||
cset_cache(squeeze);
|
||||
@ -351,16 +326,6 @@ setup(char *arg, STR *str, int cflag, int Cflag)
|
||||
return (cs);
|
||||
}
|
||||
|
||||
int
|
||||
charcoll(const void *a, const void *b)
|
||||
{
|
||||
static char sa[2], sb[2];
|
||||
|
||||
sa[0] = *(const int *)a;
|
||||
sb[0] = *(const int *)b;
|
||||
return (strcoll(sa, sb));
|
||||
}
|
||||
|
||||
static void
|
||||
usage(void)
|
||||
{
|
||||
|
Loading…
x
Reference in New Issue
Block a user