regexec: fix processing multibyte strings.

The matcher function incorrectly assumed that the moffset value we get
from findmust is in bytes, whereas it is counted in characters. Fix this
by introducing a stepback function that takes a short path if MB_CUR_MAX
is 1 and otherwise goes back byte-by-byte, checking that we have a legal
character sequence.

PR:		153502
Reviewed by:	pfg, kevans
Approved by:	kib (mentor, implicit)
Differential revision:	https://reviews.freebsd.org/D18297
This commit is contained in:
Yuri Pankov 2018-11-23 15:49:18 +00:00
parent 76b6af6731
commit 63cbe8d1d9
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=340835
3 changed files with 82 additions and 2 deletions

View File

@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$");
*/
#ifdef SNAMES
#define stepback sstepback
#define matcher smatcher
#define walk swalk
#define dissect sdissect
@ -58,6 +59,7 @@ __FBSDID("$FreeBSD$");
#define match smat
#endif
#ifdef LNAMES
#define stepback lstepback
#define matcher lmatcher
#define walk lwalk
#define dissect ldissect
@ -68,6 +70,7 @@ __FBSDID("$FreeBSD$");
#define match lmat
#endif
#ifdef MNAMES
#define stepback mstepback
#define matcher mmatcher
#define walk mwalk
#define dissect mdissect
@ -141,6 +144,39 @@ static const char *pchar(int ch);
#define NOTE(s) /* nothing */
#endif
/*
 * Given a multibyte string pointed to by start, step back nchar characters
 * from current position pointed to by cur.
 *
 * Returns the new position, or NULL when nchar characters cannot be stepped
 * over without going before start, or when no byte count up to MB_CUR_MAX
 * ending at the current position is accepted by mbrtowc().  In the
 * single-byte case NULL is also returned when the result would be start
 * itself.
 */
static const char *
stepback(const char *start, const char *cur, int nchar)
{
	const char *ret;
	int wc, mbc, mbmax;
	mbstate_t mbs;
	size_t clen;

	/* MB_CUR_MAX may expand to a function call; read it only once. */
	mbmax = (int)MB_CUR_MAX;

	/*
	 * Single-byte locale: one character is one byte.  Compare distances
	 * rather than forming cur - nchar first, since that pointer could lie
	 * before start and computing it would be undefined behavior.
	 */
	if (mbmax == 1)
		return ((cur - start) > nchar ? cur - nchar : NULL);

	ret = cur;
	for (wc = nchar; wc > 0; wc--) {
		/* Try progressively longer byte sequences ending at ret. */
		for (mbc = 1; mbc <= mbmax; mbc++) {
			/* Fewer than mbc bytes remain between start and ret. */
			if ((ret - start) < mbc)
				return (NULL);
			/* Reset the state so every attempt decodes afresh. */
			memset(&mbs, 0, sizeof(mbs));
			clen = mbrtowc(NULL, ret - mbc, mbc, &mbs);
			if (clen != (size_t)-1 && clen != (size_t)-2)
				break;
		}
		/* No candidate length converted successfully. */
		if (mbc > mbmax)
			return (NULL);
		ret -= mbc;
	}

	return (ret);
}
/*
- matcher - the actual matching engine
== static int matcher(struct re_guts *g, const char *string, \
@ -244,8 +280,13 @@ matcher(struct re_guts *g,
ZAPSTATE(&m->mbs);
/* Adjust start according to moffset, to speed things up */
if (dp != NULL && g->moffset > -1)
start = ((dp - g->moffset) < start) ? start : dp - g->moffset;
if (dp != NULL && g->moffset > -1) {
const char *nstart;
nstart = stepback(start, dp, g->moffset);
if (nstart != NULL)
start = nstart;
}
SP("mloop", m->st, *start);
@ -1083,6 +1124,7 @@ pchar(int ch)
#endif
#endif
#undef stepback
#undef matcher
#undef walk
#undef dissect

View File

@ -2,6 +2,9 @@
PACKAGE= tests
# local test cases
ATF_TESTS_SH+= multibyte
.include "Makefile.inc"
.include "${.CURDIR:H}/Makefile.netbsd-tests"
.include <bsd.test.mk>

View File

@ -0,0 +1,35 @@
# $FreeBSD$
atf_test_case multibyte
multibyte_head()
{
	# Description attached to the "multibyte" test case.
	atf_set descr 'Check matching multibyte characters (PR153502)'
}
multibyte_body()
{
	# Every input below contains the multibyte character "é"; each sed
	# pattern only matches if "." consumes it as a single character.
	LC_CTYPE="C.UTF-8"
	export LC_CTYPE

	# Anchored patterns made only of multibyte characters.
	printf 'é' | atf_check -o "inline:é" sed -ne '/^.$/p'
	printf 'éé' | atf_check -o "inline:éé" sed -ne '/^..$/p'

	# Multibyte character surrounded by literal text.
	printf 'aéa' | atf_check -o "inline:aéa" sed -ne '/a.a/p'
	printf 'aéa' | atf_check -o "inline:aéa" sed -ne '/a.*a/p'
	printf 'aaéaa' | atf_check -o "inline:aaéaa" sed -ne '/aa.aa/p'
	printf 'aéaéa' | atf_check -o "inline:aéaéa" sed -ne '/a.a.a/p'

	# Multibyte character at the start and/or end of the input.
	printf 'éa' | atf_check -o "inline:éa" sed -ne '/.a/p'
	printf 'aéaa' | atf_check -o "inline:aéaa" sed -ne '/a.aa/p'
	printf 'éaé' | atf_check -o "inline:éaé" sed -ne '/.a./p'
}
atf_init_test_cases()
{
	# Register the single test case defined in this script.
	atf_add_test_case "multibyte"
}