regcomp: reduce size of bitmap for multibyte locales

This fixes an obscure endless loop seen with case-insensitive
patterns containing characters in the 128-255 range; it was originally
found while running the GNU grep test suite.

Our regex implementation, being kludgy, translates each character
in a case-insensitive pattern into a bracket expression containing both
cases of the character. It doesn't correctly handle the case where the
original character is in the bitmap and its other case is not, and falls
into an endless loop cycling through p_bracket(), ordinary(),
and bothcases().

Reducing the bitmap to the 0-127 range for multibyte locales solves this,
as none of these characters has an other-case mapping outside of the bitmap.
We are also safe when an original character outside of the
bitmap has its other-case mapping in the bitmap (several such characters
in our current ctype maps have a unidirectional mapping into the bitmap).

Reviewed by:	bapt, kevans, pfg
Differential revision:	https://reviews.freebsd.org/D18302
This commit is contained in:
Yuri Pankov 2018-12-12 04:23:00 +00:00
parent 7bdc329113
commit 547bc083d6
4 changed files with 44 additions and 11 deletions

View File

@ -1841,21 +1841,29 @@ computejumps(struct parse *p, struct re_guts *g)
{
int ch;
int mindex;
int cmin, cmax;
/*
* For UTF-8 we process only the first 128 characters corresponding to
* the POSIX locale.
*/
cmin = MB_CUR_MAX == 1 ? CHAR_MIN : 0;
cmax = MB_CUR_MAX == 1 ? CHAR_MAX : 127;
/* Avoid making errors worse */
if (p->error != 0)
return;
g->charjump = (int*) malloc((NC + 1) * sizeof(int));
g->charjump = (int *)malloc((cmax - cmin + 1) * sizeof(int));
if (g->charjump == NULL) /* Not a fatal error */
return;
/* Adjust for signed chars, if necessary */
g->charjump = &g->charjump[-(CHAR_MIN)];
g->charjump = &g->charjump[-(cmin)];
/* If the character does not exist in the pattern, the jump
* is equal to the number of characters in the pattern.
*/
for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
for (ch = cmin; ch < cmax + 1; ch++)
g->charjump[ch] = g->mlen;
/* If the character does exist, compute the jump that would

View File

@ -113,7 +113,7 @@ typedef struct {
wint_t max;
} crange;
typedef struct {
unsigned char bmp[NC / 8];
unsigned char bmp[NC_MAX / 8];
wctype_t *types;
unsigned int ntypes;
wint_t *wides;
@ -133,9 +133,14 @@ CHIN1(cset *cs, wint_t ch)
if (ch < NC)
return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^
cs->invert);
for (i = 0; i < cs->nwides; i++)
if (ch == cs->wides[i])
for (i = 0; i < cs->nwides; i++) {
if (cs->icase) {
if (ch == towlower(cs->wides[i]) ||
ch == towupper(cs->wides[i]))
return (!cs->invert);
} else if (ch == cs->wides[i])
return (!cs->invert);
}
for (i = 0; i < cs->nranges; i++)
if (cs->ranges[i].min <= ch && ch <= cs->ranges[i].max)
return (!cs->invert);

View File

@ -39,7 +39,9 @@
/* utility definitions */
#define DUPMAX _POSIX2_RE_DUP_MAX /* xxx is this right? */
#define INFINITY (DUPMAX + 1)
#define NC (CHAR_MAX - CHAR_MIN + 1)
#define NC_MAX (CHAR_MAX - CHAR_MIN + 1)
#define NC ((MB_CUR_MAX) == 1 ? (NC_MAX) : (128))
typedef unsigned char uch;
/* switch off assertions (if not already off) if no REDEBUG */

View File

@ -1,11 +1,11 @@
# $FreeBSD$
atf_test_case multibyte
multibyte_head()
atf_test_case bmpat
bmpat_head()
{
atf_set "descr" "Check matching multibyte characters (PR153502)"
}
multibyte_body()
bmpat_body()
{
export LC_CTYPE="C.UTF-8"
@ -29,7 +29,25 @@ multibyte_body()
sed -ne '/.a./p'
}
atf_test_case icase
icase_head()
{
atf_set "descr" "Check case-insensitive matching for characters 128-255"
}
icase_body()
{
export LC_CTYPE="C.UTF-8"
a=$(printf '\302\265\n') # U+00B5
b=$(printf '\316\234\n') # U+039C
c=$(printf '\316\274\n') # U+03BC
echo $b | atf_check -o "inline:$b\n" sed -ne "/$a/Ip"
echo $c | atf_check -o "inline:$c\n" sed -ne "/$a/Ip"
}
atf_init_test_cases()
{
atf_add_test_case multibyte
atf_add_test_case bmpat
atf_add_test_case icase
}