Make regular expression matching aware of multibyte characters. The general

idea is that we perform multibyte->wide character conversion while parsing
and compiling, then convert byte sequences to wide characters when they're
needed for comparison and stepping through the string during execution.

As with tr(1), the main complication is to efficiently represent sets of
characters in bracket expressions. The old bitmap representation is replaced
by a bitmap for the first 256 characters combined with a vector of individual
wide characters, a vector of character ranges (for [A-Z] etc.), and a vector
of character classes (for [[:alpha:]] etc.).

One other point of interest is that although the Boyer-Moore algorithm had
to be disabled in the general multibyte case, it is still enabled for UTF-8
because of its self-synchronizing nature. This greatly speeds up matching
by reducing the number of multibyte conversions that need to be done.
This commit is contained in:
tjr 2004-07-12 07:35:59 +00:00
parent 9ca87da3c6
commit 7217fd1765
5 changed files with 494 additions and 340 deletions

View File

@ -69,6 +69,17 @@ __FBSDID("$FreeBSD$");
#define at lat
#define match lmat
#endif
#ifdef MNAMES
#define matcher mmatcher
#define fast mfast
#define slow mslow
#define dissect mdissect
#define backref mbackref
#define step mstep
#define print mprint
#define at mat
#define match mmat
#endif
/* another structure passed up and down to avoid zillions of parameters */
struct match {
@ -85,6 +96,7 @@ struct match {
states fresh; /* states for a fresh start */
states tmp; /* temporary */
states empty; /* empty set of states */
mbstate_t mbs; /* multibyte conversion state */
};
/* ========= begin header generated by ./mkh ========= */
@ -98,16 +110,15 @@ static char *dissect(struct match *m, char *start, char *stop, sopno startst, so
static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev);
static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst);
static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst);
static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft);
#define BOL (OUT+1)
#define EOL (BOL+1)
#define BOLEOL (BOL+2)
#define NOTHING (BOL+3)
#define BOW (BOL+4)
#define EOW (BOL+5)
#define CODEMAX (BOL+5) /* highest code used */
#define NONCHAR(c) ((c) > CHAR_MAX)
#define NNONCHAR (CODEMAX-CHAR_MAX)
static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft);
#define BOL (OUT-1)
#define EOL (BOL-1)
#define BOLEOL (BOL-2)
#define NOTHING (BOL-3)
#define BOW (BOL-4)
#define EOW (BOL-5)
#define BADCHAR (BOL-6)
#define NONCHAR(c) ((c) <= OUT)
#ifdef REDEBUG
static void print(struct match *m, char *caption, states st, int ch, FILE *d);
#endif
@ -234,6 +245,7 @@ int eflags;
SETUP(m->tmp);
SETUP(m->empty);
CLEAR(m->empty);
ZAPSTATE(&m->mbs);
/* Adjust start according to moffset, to speed things up */
if (g->moffset > -1)
@ -257,7 +269,8 @@ int eflags;
if (endp != NULL)
break;
assert(m->coldp < m->endp);
m->coldp++;
m->coldp += XMBRTOWC(NULL, m->coldp,
m->endp - m->coldp, &m->mbs, 0);
}
if (nmatch == 1 && !g->backrefs)
break; /* no further info needed */
@ -316,7 +329,9 @@ int eflags;
/* despite initial appearances, there is no match here */
NOTE("false alarm");
start = m->coldp + 1; /* recycle starting later */
/* recycle starting later */
start = m->coldp + XMBRTOWC(NULL, m->coldp,
m->endp - m->coldp, &m->mbs, 0);
assert(start <= stop);
}
@ -394,7 +409,7 @@ sopno stopst;
assert(nope);
break;
case OCHAR:
sp++;
sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
break;
case OBOL:
case OEOL:
@ -403,7 +418,7 @@ sopno stopst;
break;
case OANY:
case OANYOF:
sp++;
sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
break;
case OBACK_:
case O_BACK:
@ -558,6 +573,7 @@ sopno lev; /* PLUS nesting level */
sop s;
regoff_t offsave;
cset *cs;
wint_t wc;
AT("back", start, stop, startst, stopst);
sp = start;
@ -567,17 +583,25 @@ sopno lev; /* PLUS nesting level */
for (ss = startst; !hard && ss < stopst; ss++)
switch (OP(s = m->g->strip[ss])) {
case OCHAR:
if (sp == stop || *sp++ != (char)OPND(s))
if (sp == stop)
return(NULL);
sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
if (wc != OPND(s))
return(NULL);
break;
case OANY:
if (sp == stop)
return(NULL);
sp++;
sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
if (wc == BADCHAR)
return (NULL);
break;
case OANYOF:
if (sp == stop)
return (NULL);
cs = &m->g->sets[OPND(s)];
if (sp == stop || !CHIN(cs, *sp++))
sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
if (wc == BADCHAR || !CHIN(cs, wc))
return(NULL);
break;
case OBOL:
@ -754,11 +778,12 @@ sopno stopst;
states fresh = m->fresh;
states tmp = m->tmp;
char *p = start;
int c = (start == m->beginp) ? OUT : *(start-1);
int lastc; /* previous c */
int flagch;
wint_t c;
wint_t lastc; /* previous c */
wint_t flagch;
int i;
char *coldp; /* last p after which no match was underway */
size_t clen;
CLEAR(st);
SET1(st, startst);
@ -766,10 +791,23 @@ sopno stopst;
ASSIGN(fresh, st);
SP("start", st, *p);
coldp = NULL;
if (start == m->beginp)
c = OUT;
else {
/*
* XXX Wrong if the previous character was multi-byte.
* Newline never is (in encodings supported by FreeBSD),
* so this only breaks the ISWORD tests below.
*/
c = (uch)*(start - 1);
}
for (;;) {
/* next character */
lastc = c;
c = (p == m->endp) ? OUT : *p;
if (p == m->endp)
c = OUT;
else
clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
if (EQ(st, fresh))
coldp = p;
@ -817,13 +855,13 @@ sopno stopst;
st = step(m->g, startst, stopst, tmp, c, st);
SP("aft", st, c);
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
p++;
p += clen;
}
assert(coldp != NULL);
m->coldp = coldp;
if (ISSET(st, stopst))
return(p+1);
return(p+XMBRTOWC(NULL, p, m->endp - p, &m->mbs, 0));
else
return(NULL);
}
@ -845,11 +883,12 @@ sopno stopst;
states empty = m->empty;
states tmp = m->tmp;
char *p = start;
int c = (start == m->beginp) ? OUT : *(start-1);
int lastc; /* previous c */
int flagch;
wint_t c;
wint_t lastc; /* previous c */
wint_t flagch;
int i;
char *matchp; /* last p at which a match ended */
size_t clen;
AT("slow", start, stop, startst, stopst);
CLEAR(st);
@ -857,10 +896,24 @@ sopno stopst;
SP("sstart", st, *p);
st = step(m->g, startst, stopst, st, NOTHING, st);
matchp = NULL;
if (start == m->beginp)
c = OUT;
else {
/*
* XXX Wrong if the previous character was multi-byte.
* Newline never is (in encodings supported by FreeBSD),
* so this only breaks the ISWORD tests below.
*/
c = (uch)*(start - 1);
}
for (;;) {
/* next character */
lastc = c;
c = (p == m->endp) ? OUT : *p;
if (p == m->endp) {
c = OUT;
clen = 0;
} else
clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
/* is there an EOL and/or BOL between lastc and c? */
flagch = '\0';
@ -908,7 +961,7 @@ sopno stopst;
st = step(m->g, startst, stopst, tmp, c, st);
SP("saft", st, c);
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
p++;
p += clen;
}
return(matchp);
@ -919,15 +972,14 @@ sopno stopst;
- step - map set of states reachable before char to set reachable after
== static states step(struct re_guts *g, sopno start, sopno stop, \
== states bef, int ch, states aft);
== #define BOL (OUT+1)
== #define EOL (BOL+1)
== #define BOLEOL (BOL+2)
== #define NOTHING (BOL+3)
== #define BOW (BOL+4)
== #define EOW (BOL+5)
== #define CODEMAX (BOL+5) // highest code used
== #define NONCHAR(c) ((c) > CHAR_MAX)
== #define NNONCHAR (CODEMAX-CHAR_MAX)
== #define BOL (OUT-1)
== #define EOL (BOL-1)
== #define BOLEOL (BOL-2)
== #define NOTHING (BOL-3)
== #define BOW (BOL-4)
== #define EOW (BOL-5)
== #define BADCHAR (BOL-6)
== #define NONCHAR(c) ((c) <= OUT)
*/
static states
step(g, start, stop, bef, ch, aft)
@ -935,7 +987,7 @@ struct re_guts *g;
sopno start; /* start state within strip */
sopno stop; /* state after stop state within strip */
states bef; /* states reachable before */
int ch; /* character or NONCHAR code */
wint_t ch; /* character or NONCHAR code */
states aft; /* states already known reachable after */
{
cset *cs;
@ -953,8 +1005,8 @@ states aft; /* states already known reachable after */
break;
case OCHAR:
/* only characters can match */
assert(!NONCHAR(ch) || ch != (char)OPND(s));
if (ch == (char)OPND(s))
assert(!NONCHAR(ch) || ch != OPND(s));
if (ch == OPND(s))
FWD(aft, bef, 1);
break;
case OBOL:

View File

@ -50,13 +50,14 @@ __FBSDID("$FreeBSD$");
#include <limits.h>
#include <stdlib.h>
#include <regex.h>
#include <wchar.h>
#include <wctype.h>
#include "collate.h"
#include "utils.h"
#include "regex2.h"
#include "cclass.h"
#include "cname.h"
/*
@ -83,29 +84,30 @@ extern "C" {
#endif
/* === regcomp.c === */
static void p_ere(struct parse *p, int stop);
static void p_ere(struct parse *p, wint_t stop);
static void p_ere_exp(struct parse *p);
static void p_str(struct parse *p);
static void p_bre(struct parse *p, int end1, int end2);
static void p_bre(struct parse *p, wint_t end1, wint_t end2);
static int p_simp_re(struct parse *p, int starordinary);
static int p_count(struct parse *p);
static void p_bracket(struct parse *p);
static void p_b_term(struct parse *p, cset *cs);
static void p_b_cclass(struct parse *p, cset *cs);
static void p_b_eclass(struct parse *p, cset *cs);
static char p_b_symbol(struct parse *p);
static char p_b_coll_elem(struct parse *p, int endc);
static char othercase(int ch);
static void bothcases(struct parse *p, int ch);
static void ordinary(struct parse *p, int ch);
static wint_t p_b_symbol(struct parse *p);
static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
static wint_t othercase(wint_t ch);
static void bothcases(struct parse *p, wint_t ch);
static void ordinary(struct parse *p, wint_t ch);
static void nonnewline(struct parse *p);
static void repeat(struct parse *p, sopno start, int from, int to);
static int seterr(struct parse *p, int e);
static cset *allocset(struct parse *p);
static void freeset(struct parse *p, cset *cs);
static int freezeset(struct parse *p, cset *cs);
static int firstch(struct parse *p, cset *cs);
static int nch(struct parse *p, cset *cs);
static void CHadd(struct parse *p, cset *cs, wint_t ch);
static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
static wint_t singleton(cset *cs);
static sopno dupl(struct parse *p, sopno start, sopno finish);
static void doemit(struct parse *p, sop op, size_t opnd);
static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
@ -117,6 +119,7 @@ static int altoffset(sop *scan, int offset);
static void computejumps(struct parse *p, struct re_guts *g);
static void computematchjumps(struct parse *p, struct re_guts *g);
static sopno pluscount(struct parse *p, struct re_guts *g);
static wint_t wgetnext(struct parse *p);
#ifdef __cplusplus
}
@ -141,6 +144,7 @@ static char nuls[10]; /* place to point scanner in event of error */
#define NEXT2() (p->next += 2)
#define NEXTn(n) (p->next += (n))
#define GETNEXT() (*p->next++)
#define WGETNEXT() wgetnext(p)
#define SETERROR(e) seterr(p, (e))
#define REQUIRE(co, e) ((co) || SETERROR(e))
#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
@ -226,9 +230,7 @@ int cflags;
p->pbegin[i] = 0;
p->pend[i] = 0;
}
g->csetsize = NC;
g->sets = NULL;
g->setbits = NULL;
g->ncsets = 0;
g->cflags = cflags;
g->iflags = 0;
@ -340,6 +342,7 @@ p_ere_exp(p)
struct parse *p;
{
char c;
wint_t wc;
sopno pos;
int count;
int count2;
@ -409,14 +412,16 @@ struct parse *p;
break;
case '\\':
(void)REQUIRE(MORE(), REG_EESCAPE);
c = GETNEXT();
ordinary(p, c);
wc = WGETNEXT();
ordinary(p, wc);
break;
case '{': /* okay as ordinary except if digit follows */
(void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
/* FALLTHROUGH */
default:
ordinary(p, c);
p->next--;
wc = WGETNEXT();
ordinary(p, wc);
break;
}
@ -490,7 +495,7 @@ struct parse *p;
{
(void)REQUIRE(MORE(), REG_EMPTY);
while (MORE())
ordinary(p, GETNEXT());
ordinary(p, WGETNEXT());
}
/*
@ -546,6 +551,7 @@ int starordinary; /* is a leading * an ordinary character? */
int count2;
sopno pos;
int i;
wint_t wc;
sopno subno;
# define BACKSL (1<<CHAR_BIT)
@ -617,7 +623,9 @@ int starordinary; /* is a leading * an ordinary character? */
(void)REQUIRE(starordinary, REG_BADRPT);
/* FALLTHROUGH */
default:
ordinary(p, (char)c);
p->next--;
wc = WGETNEXT();
ordinary(p, wc);
break;
}
@ -673,16 +681,13 @@ struct parse *p;
/*
- p_bracket - parse a bracketed character list
== static void p_bracket(struct parse *p);
*
* Note a significant property of this code: if the allocset() did SETERROR,
* no set operations are done.
*/
static void
p_bracket(p)
struct parse *p;
{
cset *cs = allocset(p);
int invert = 0;
cset *cs;
wint_t ch;
/* Dept of Truly Sickening Special-Case Kludges */
if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
@ -696,49 +701,34 @@ struct parse *p;
return;
}
if ((cs = allocset(p)) == NULL)
return;
if (p->g->cflags&REG_ICASE)
cs->icase = 1;
if (EAT('^'))
invert++; /* make note to invert set at end */
cs->invert = 1;
if (EAT(']'))
CHadd(cs, ']');
CHadd(p, cs, ']');
else if (EAT('-'))
CHadd(cs, '-');
CHadd(p, cs, '-');
while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
p_b_term(p, cs);
if (EAT('-'))
CHadd(cs, '-');
CHadd(p, cs, '-');
(void)MUSTEAT(']', REG_EBRACK);
if (p->error != 0) /* don't mess things up further */
return;
if (p->g->cflags&REG_ICASE) {
int i;
int ci;
if (cs->invert && p->g->cflags&REG_NEWLINE)
cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
for (i = p->g->csetsize - 1; i >= 0; i--)
if (CHIN(cs, i) && isalpha(i)) {
ci = othercase(i);
if (ci != i)
CHadd(cs, ci);
}
}
if (invert) {
int i;
for (i = p->g->csetsize - 1; i >= 0; i--)
if (CHIN(cs, i))
CHsub(cs, i);
else
CHadd(cs, i);
if (p->g->cflags&REG_NEWLINE)
CHsub(cs, '\n');
}
if (nch(p, cs) == 1) { /* optimize singleton sets */
ordinary(p, firstch(p, cs));
if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
ordinary(p, ch);
freeset(p, cs);
} else
EMIT(OANYOF, freezeset(p, cs));
EMIT(OANYOF, (int)(cs - p->g->sets));
}
/*
@ -751,8 +741,8 @@ struct parse *p;
cset *cs;
{
char c;
char start, finish;
int i;
wint_t start, finish;
wint_t i;
/* classify what we've got */
switch ((MORE()) ? PEEK() : '\0') {
@ -799,19 +789,18 @@ cset *cs;
} else
finish = start;
if (start == finish)
CHadd(cs, start);
CHadd(p, cs, start);
else {
if (__collate_load_error) {
(void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
for (i = (uch)start; i <= (uch)finish; i++)
CHadd(cs, i);
CHaddrange(p, cs, start, finish);
} else {
(void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
for (i = CHAR_MIN; i <= CHAR_MAX; i++) {
for (i = 0; i <= UCHAR_MAX; i++) {
if ( __collate_range_cmp(start, i) <= 0
&& __collate_range_cmp(i, finish) <= 0
)
CHadd(cs, i);
CHadd(p, cs, i);
}
}
}
@ -828,85 +817,25 @@ p_b_cclass(p, cs)
struct parse *p;
cset *cs;
{
int c;
char *sp = p->next;
struct cclass *cp;
size_t len;
wctype_t wct;
char clname[16];
while (MORE() && isalpha((uch)PEEK()))
NEXT();
len = p->next - sp;
for (cp = cclasses; cp->name != NULL; cp++)
if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
break;
if (cp->name == NULL) {
/* oops, didn't find it */
if (len >= sizeof(clname) - 1) {
SETERROR(REG_ECTYPE);
return;
}
switch (cp->fidx) {
case CALNUM:
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
if (isalnum((uch)c))
CHadd(cs, c);
break;
case CALPHA:
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
if (isalpha((uch)c))
CHadd(cs, c);
break;
case CBLANK:
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
if (isblank((uch)c))
CHadd(cs, c);
break;
case CCNTRL:
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
if (iscntrl((uch)c))
CHadd(cs, c);
break;
case CDIGIT:
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
if (isdigit((uch)c))
CHadd(cs, c);
break;
case CGRAPH:
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
if (isgraph((uch)c))
CHadd(cs, c);
break;
case CLOWER:
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
if (islower((uch)c))
CHadd(cs, c);
break;
case CPRINT:
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
if (isprint((uch)c))
CHadd(cs, c);
break;
case CPUNCT:
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
if (ispunct((uch)c))
CHadd(cs, c);
break;
case CSPACE:
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
if (isspace((uch)c))
CHadd(cs, c);
break;
case CUPPER:
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
if (isupper((uch)c))
CHadd(cs, c);
break;
case CXDIGIT:
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
if (isxdigit((uch)c))
CHadd(cs, c);
break;
memcpy(clname, sp, len);
clname[len] = '\0';
if ((wct = wctype(clname)) == 0) {
SETERROR(REG_ECTYPE);
return;
}
CHaddtype(p, cs, wct);
}
/*
@ -920,25 +849,25 @@ p_b_eclass(p, cs)
struct parse *p;
cset *cs;
{
char c;
wint_t c;
c = p_b_coll_elem(p, '=');
CHadd(cs, c);
CHadd(p, cs, c);
}
/*
- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
== static char p_b_symbol(struct parse *p);
*/
static char /* value of symbol */
static wint_t /* value of symbol */
p_b_symbol(p)
struct parse *p;
{
char value;
wint_t value;
(void)REQUIRE(MORE(), REG_EBRACK);
if (!EATTWO('[', '.'))
return(GETNEXT());
return(WGETNEXT());
/* collating symbol */
value = p_b_coll_elem(p, '.');
@ -950,14 +879,17 @@ struct parse *p;
- p_b_coll_elem - parse a collating-element name and look it up
== static char p_b_coll_elem(struct parse *p, int endc);
*/
static char /* value of collating element */
static wint_t /* value of collating element */
p_b_coll_elem(p, endc)
struct parse *p;
int endc; /* name ended by endc,']' */
wint_t endc; /* name ended by endc,']' */
{
char *sp = p->next;
struct cname *cp;
int len;
mbstate_t mbs;
wchar_t wc;
size_t clen;
while (MORE() && !SEETWO(endc, ']'))
NEXT();
@ -969,9 +901,13 @@ int endc; /* name ended by endc,']' */
for (cp = cnames; cp->name != NULL; cp++)
if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
return(cp->code); /* known name */
if (len == 1)
return(*sp); /* single character */
SETERROR(REG_ECOLLATE); /* neither */
memset(&mbs, 0, sizeof(mbs));
if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
return (wc); /* single character */
else if (clen == (size_t)-1 || clen == (size_t)-2)
SETERROR(REG_ILLSEQ);
else
SETERROR(REG_ECOLLATE); /* neither */
return(0);
}
@ -979,16 +915,15 @@ int endc; /* name ended by endc,']' */
- othercase - return the case counterpart of an alphabetic
== static char othercase(int ch);
*/
static char /* if no counterpart, return ch */
static wint_t /* if no counterpart, return ch */
othercase(ch)
int ch;
wint_t ch;
{
ch = (uch)ch;
assert(isalpha(ch));
if (isupper(ch))
return(tolower(ch));
else if (islower(ch))
return(toupper(ch));
assert(iswalpha(ch));
if (iswupper(ch))
return(towlower(ch));
else if (iswlower(ch))
return(towupper(ch));
else /* peculiar, but could happen */
return(ch);
}
@ -1002,21 +937,24 @@ int ch;
static void
bothcases(p, ch)
struct parse *p;
int ch;
wint_t ch;
{
char *oldnext = p->next;
char *oldend = p->end;
char bracket[3];
char bracket[3 + MB_LEN_MAX];
size_t n;
mbstate_t mbs;
ch = (uch)ch;
assert(othercase(ch) != ch); /* p_bracket() would recurse */
p->next = bracket;
p->end = bracket+2;
bracket[0] = ch;
bracket[1] = ']';
bracket[2] = '\0';
memset(&mbs, 0, sizeof(mbs));
n = wcrtomb(bracket, ch, &mbs);
assert(n != (size_t)-1);
bracket[n] = ']';
bracket[n + 1] = '\0';
p->end = bracket+n+1;
p_bracket(p);
assert(p->next == bracket+2);
assert(p->next == p->end);
p->next = oldnext;
p->end = oldend;
}
@ -1028,13 +966,24 @@ int ch;
static void
ordinary(p, ch)
struct parse *p;
int ch;
wint_t ch;
{
cset *cs;
if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
bothcases(p, ch);
else
EMIT(OCHAR, (uch)ch);
else if ((ch & OPDMASK) == ch)
EMIT(OCHAR, ch);
else {
/*
* Kludge: character is too big to fit into an OCHAR operand.
* Emit a singleton set.
*/
if ((cs = allocset(p)) == NULL)
return;
CHadd(p, cs, ch);
EMIT(OANYOF, (int)(cs - p->g->sets));
}
}
/*
@ -1135,6 +1084,31 @@ int to; /* to this number of times (maybe INFINITY) */
}
}
/*
- wgetnext - helper function for WGETNEXT() macro. Gets the next wide
- character from the parse struct, signals a REG_ILLSEQ error if the
- character can't be converted. Returns the number of bytes consumed.
*/
static wint_t
wgetnext(p)
struct parse *p;
{
mbstate_t mbs;
wchar_t wc;
size_t n;
memset(&mbs, 0, sizeof(mbs));
n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
if (n == (size_t)-1 || n == (size_t)-2) {
SETERROR(REG_ILLSEQ);
return (0);
}
if (n == 0)
n = 1;
p->next += n;
return (wc);
}
/*
- seterr - set an error condition
== static int seterr(struct parse *p, int e);
@ -1159,47 +1133,16 @@ static cset *
allocset(p)
struct parse *p;
{
int no = p->g->ncsets++;
size_t nc;
size_t nbytes;
cset *cs;
size_t css = (size_t)p->g->csetsize;
int i;
cset *cs, *ncs;
if (no >= p->ncsalloc) { /* need another column of space */
p->ncsalloc += CHAR_BIT;
nc = p->ncsalloc;
assert(nc % CHAR_BIT == 0);
nbytes = nc / CHAR_BIT * css;
if (p->g->sets == NULL)
p->g->sets = (cset *)malloc(nc * sizeof(cset));
else
p->g->sets = (cset *)reallocf((char *)p->g->sets,
nc * sizeof(cset));
if (p->g->setbits == NULL)
p->g->setbits = (uch *)malloc(nbytes);
else {
p->g->setbits = (uch *)reallocf((char *)p->g->setbits,
nbytes);
/* xxx this isn't right if setbits is now NULL */
for (i = 0; i < no; i++)
p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
}
if (p->g->sets != NULL && p->g->setbits != NULL)
(void) memset((char *)p->g->setbits + (nbytes - css),
0, css);
else {
no = 0;
SETERROR(REG_ESPACE);
/* caller's responsibility not to do set ops */
}
ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs));
if (ncs == NULL) {
SETERROR(REG_ESPACE);
return (NULL);
}
assert(p->g->sets != NULL); /* xxx */
cs = &p->g->sets[no];
cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
cs->mask = 1 << ((no) % CHAR_BIT);
cs->hash = 0;
p->g->sets = ncs;
cs = &p->g->sets[p->g->ncsets++];
memset(cs, 0, sizeof(*cs));
return(cs);
}
@ -1213,92 +1156,124 @@ freeset(p, cs)
struct parse *p;
cset *cs;
{
int i;
cset *top = &p->g->sets[p->g->ncsets];
size_t css = (size_t)p->g->csetsize;
for (i = 0; i < css; i++)
CHsub(cs, i);
free(cs->wides);
free(cs->ranges);
free(cs->types);
memset(cs, 0, sizeof(*cs));
if (cs == top-1) /* recover only the easy case */
p->g->ncsets--;
}
/*
- freezeset - final processing on a set of characters
== static int freezeset(struct parse *p, cset *cs);
*
* The main task here is merging identical sets. This is usually a waste
* of time (although the hash code minimizes the overhead), but can win
* big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
* is done using addition rather than xor -- all ASCII [aA] sets xor to
* the same value!
- singleton - Determine whether a set contains only one character,
- returning it if so, otherwise returning OUT.
*/
static int /* set number */
freezeset(p, cs)
struct parse *p;
static wint_t
singleton(cs)
cset *cs;
{
short h = cs->hash;
int i;
cset *top = &p->g->sets[p->g->ncsets];
cset *cs2;
size_t css = (size_t)p->g->csetsize;
wint_t i, s, n;
/* look for an earlier one which is the same */
for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
if (cs2->hash == h && cs2 != cs) {
/* maybe */
for (i = 0; i < css; i++)
if (!!CHIN(cs2, i) != !!CHIN(cs, i))
break; /* no */
if (i == css)
break; /* yes */
}
if (cs2 < top) { /* found one */
freeset(p, cs);
cs = cs2;
}
return((int)(cs - p->g->sets));
}
/*
- firstch - return first character in a set (which must have at least one)
== static int firstch(struct parse *p, cset *cs);
*/
static int /* character; there is no "none" value */
firstch(p, cs)
struct parse *p;
cset *cs;
{
int i;
size_t css = (size_t)p->g->csetsize;
for (i = 0; i < css; i++)
if (CHIN(cs, i))
return((char)i);
assert(never);
return(0); /* arbitrary */
}
/*
- nch - number of characters in a set
== static int nch(struct parse *p, cset *cs);
*/
static int
nch(p, cs)
struct parse *p;
cset *cs;
{
int i;
size_t css = (size_t)p->g->csetsize;
int n = 0;
for (i = 0; i < css; i++)
if (CHIN(cs, i))
for (i = n = 0; i < NC; i++)
if (CHIN(cs, i)) {
n++;
return(n);
s = i;
}
if (n == 1)
return (s);
if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
cs->icase == 0)
return (cs->wides[0]);
/* Don't bother handling the other cases. */
return (OUT);
}
/*
- CHadd - add character to character set.
*/
static void
CHadd(p, cs, ch)
struct parse *p;
cset *cs;
wint_t ch;
{
wint_t *newwides;
assert(ch >= 0);
if (ch < NC) {
cs->bmp[ch >> 3] |= 1 << (ch & 7);
if (cs->icase) {
cs->bmp[towlower(ch) >> 3] |= 1 << (towlower(ch) & 7);
cs->bmp[towupper(ch) >> 3] |= 1 << (towupper(ch) & 7);
}
} else {
newwides = realloc(cs->wides, (cs->nwides + 1) *
sizeof(*cs->wides));
if (newwides == NULL) {
SETERROR(REG_ESPACE);
return;
}
cs->wides = newwides;
cs->wides[cs->nwides++] = ch;
}
}
/*
- CHaddrange - add all characters in the range [min,max] to a character set.
*/
static void
CHaddrange(p, cs, min, max)
struct parse *p;
cset *cs;
wint_t min, max;
{
crange *newranges;
for (; min < NC && min <= max; min++)
CHadd(p, cs, min);
if (min >= max)
return;
newranges = realloc(cs->ranges, (cs->nranges + 1) *
sizeof(*cs->ranges));
if (newranges == NULL) {
SETERROR(REG_ESPACE);
return;
}
cs->ranges = newranges;
cs->ranges[cs->nranges].min = min;
cs->ranges[cs->nranges].min = max;
cs->nranges++;
}
/*
- CHaddtype - add all characters of a certain type to a character set.
*/
static void
CHaddtype(p, cs, wct)
struct parse *p;
cset *cs;
wctype_t wct;
{
wint_t i;
wctype_t *newtypes;
for (i = 0; i < NC; i++) {
if (iswctype(i, wct))
CHadd(p, cs, i);
if (cs->icase && i != towlower(i))
CHadd(p, cs, towlower(i));
if (cs->icase && i != towupper(i))
CHadd(p, cs, towupper(i));
}
newtypes = realloc(cs->types, (cs->ntypes + 1) *
sizeof(*cs->types));
if (newtypes == NULL) {
SETERROR(REG_ESPACE);
return;
}
cs->types = newtypes;
cs->types[cs->ntypes++] = wct;
}
/*
@ -1474,13 +1449,24 @@ struct re_guts *g;
sopno newlen;
sop s;
char *cp;
sopno i;
int offset;
char buf[MB_LEN_MAX];
size_t clen;
mbstate_t mbs;
/* avoid making error situations worse */
if (p->error != 0)
return;
/*
* It's not generally safe to do a ``char'' substring search on
* multibyte character strings, but it's safe for at least
* UTF-8 (see RFC 3629).
*/
if (MB_CUR_MAX > 1 &&
strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
return;
/* find the longest OCHAR sequence in strip */
newlen = 0;
offset = 0;
@ -1490,9 +1476,14 @@ struct re_guts *g;
s = *scan++;
switch (OP(s)) {
case OCHAR: /* sequence member */
if (newlen == 0) /* new sequence */
if (newlen == 0) { /* new sequence */
memset(&mbs, 0, sizeof(mbs));
newstart = scan - 1;
newlen++;
}
clen = wcrtomb(buf, OPND(s), &mbs);
if (clen == (size_t)-1)
goto toohard;
newlen += clen;
break;
case OPLUS_: /* things that don't break one */
case OLPAREN:
@ -1569,6 +1560,7 @@ struct re_guts *g;
offset++;
newlen = 0;
break;
toohard:
default:
/* Anything here makes it impossible or too hard
* to calculate the offset -- so we give up;
@ -1603,11 +1595,13 @@ struct re_guts *g;
}
cp = g->must;
scan = start;
for (i = g->mlen; i > 0; i--) {
memset(&mbs, 0, sizeof(mbs));
while (cp < g->must + g->mlen) {
while (OP(s = *scan++) != OCHAR)
continue;
assert(cp < g->must + g->mlen);
*cp++ = (char)OPND(s);
clen = wcrtomb(cp, OPND(s), &mbs);
assert(clen != (size_t)-1);
cp += clen;
}
assert(cp == g->must + g->mlen);
*cp++ = '\0'; /* just on general principles */

View File

@ -87,7 +87,7 @@ typedef long sopno;
/* operators meaning operand */
/* (back, fwd are offsets) */
#define OEND (1L<<OPSHIFT) /* endmarker - */
#define OCHAR (2L<<OPSHIFT) /* character unsigned char */
#define OCHAR (2L<<OPSHIFT) /* character wide character */
#define OBOL (3L<<OPSHIFT) /* left anchor - */
#define OEOL (4L<<OPSHIFT) /* right anchor - */
#define OANY (5L<<OPSHIFT) /* . - */
@ -108,21 +108,63 @@ typedef long sopno;
#define OEOW (20L<<OPSHIFT) /* end word - */
/*
* Structure for [] character-set representation. Character sets are
* done as bit vectors, grouped 8 to a byte vector for compactness.
* The individual set therefore has both a pointer to the byte vector
* and a mask to pick out the relevant bit of each byte. A hash code
* simplifies testing whether two sets could be identical.
* Structures for [] character-set representation.
*/
typedef struct {
uch *ptr; /* -> uch [csetsize] */
uch mask; /* bit within array */
short hash; /* hash code */
wint_t min;
wint_t max;
} crange;
typedef struct {
unsigned char bmp[NC / 8];
wctype_t *types;
int ntypes;
wint_t *wides;
int nwides;
crange *ranges;
int nranges;
int invert;
int icase;
} cset;
/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */
#define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (uch)(c))
#define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (uch)(c))
#define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask)
static int
CHIN1(cs, ch)
cset *cs;
wint_t ch;
{
int i;
assert(ch >= 0);
if (ch < NC)
return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^
cs->invert);
for (i = 0; i < cs->nwides; i++)
if (ch == cs->wides[i])
return (!cs->invert);
for (i = 0; i < cs->nranges; i++)
if (cs->ranges[i].min <= ch && ch <= cs->ranges[i].max)
return (!cs->invert);
for (i = 0; i < cs->ntypes; i++)
if (iswctype(ch, cs->types[i]))
return (!cs->invert);
return (cs->invert);
}
static __inline int
CHIN(cs, ch)
cset *cs;
wint_t ch;
{
assert(ch >= 0);
if (ch < NC)
return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^
cs->invert);
else if (cs->icase)
return (CHIN1(cs, ch) || CHIN1(cs, towlower(ch)) ||
CHIN1(cs, towupper(ch)));
else
return (CHIN1(cs, ch));
}
/*
* main compiled-expression structure
@ -131,10 +173,8 @@ struct re_guts {
int magic;
# define MAGIC2 ((('R'^0200)<<8)|'E')
sop *strip; /* malloced area for strip */
int csetsize; /* number of bits in a cset vector */
int ncsets; /* number of csets in use */
cset *sets; /* -> cset [ncsets] */
uch *setbits; /* -> uch[csetsize][ncsets/CHAR_BIT] */
int cflags; /* copy of regcomp() cflags argument */
sopno nstates; /* = number of sops */
sopno firststate; /* the initial OEND (normally 0) */
@ -156,5 +196,5 @@ struct re_guts {
};
/* misc utilities */
#define OUT (CHAR_MAX+1) /* a non-character value */
#define ISWORD(c) (isalnum((uch)(c)) || (c) == '_')
#define OUT (-2) /* a non-character value */
#define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_')

View File

@ -46,9 +46,9 @@ __FBSDID("$FreeBSD$");
/*
* the outer shell of regexec()
*
* This file includes engine.c *twice*, after muchos fiddling with the
* This file includes engine.c three times, after muchos fiddling with the
* macros that code uses. This lets the same code operate on two different
* representations for state sets.
* representations for state sets and characters.
*/
#include <sys/types.h>
#include <stdio.h>
@ -57,12 +57,53 @@ __FBSDID("$FreeBSD$");
#include <limits.h>
#include <ctype.h>
#include <regex.h>
#include <wchar.h>
#include <wctype.h>
#include "utils.h"
#include "regex2.h"
static int nope __unused = 0; /* for use in asserts; shuts lint up */
static __inline size_t
xmbrtowc(wi, s, n, mbs, dummy)
wint_t *wi;
const char *s;
size_t n;
mbstate_t *mbs;
wint_t dummy;
{
size_t nr;
wchar_t wc;
nr = mbrtowc(&wc, s, n, mbs);
if (wi != NULL)
*wi = wc;
if (nr == 0)
return (1);
else if (nr == (size_t)-1 || nr == (size_t)-2) {
memset(mbs, 0, sizeof(*mbs));
if (wi != NULL)
*wi = dummy;
return (1);
} else
return (nr);
}
static __inline size_t
xmbrtowc_dummy(wi, s, n, mbs, dummy)
wint_t *wi;
const char *s;
size_t n __unused;
mbstate_t *mbs __unused;
wint_t dummy __unused;
{
if (wi != NULL)
*wi = (unsigned char)*s;
return (1);
}
/* macros for manipulating states, small version */
#define states long
#define states1 states /* for later use in regexec() decision */
@ -85,6 +126,9 @@ static int nope __unused = 0; /* for use in asserts; shuts lint up */
#define FWD(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) << (n))
#define BACK(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) >> (n))
#define ISSETBACK(v, n) (((v) & ((unsigned long)here >> (n))) != 0)
/* no multibyte support */
#define XMBRTOWC xmbrtowc_dummy
#define ZAPSTATE(mbs) ((void)(mbs))
/* function names */
#define SNAMES /* engine.c looks after details */
@ -110,6 +154,8 @@ static int nope __unused = 0; /* for use in asserts; shuts lint up */
#undef BACK
#undef ISSETBACK
#undef SNAMES
#undef XMBRTOWC
#undef ZAPSTATE
/* macros for manipulating states, large version */
#define states char *
@ -134,11 +180,24 @@ static int nope __unused = 0; /* for use in asserts; shuts lint up */
#define FWD(dst, src, n) ((dst)[here+(n)] |= (src)[here])
#define BACK(dst, src, n) ((dst)[here-(n)] |= (src)[here])
#define ISSETBACK(v, n) ((v)[here - (n)])
/* no multibyte support */
#define XMBRTOWC xmbrtowc_dummy
#define ZAPSTATE(mbs) ((void)(mbs))
/* function names */
#define LNAMES /* flag */
#include "engine.c"
/* multibyte character & large states version */
#undef LNAMES
#undef XMBRTOWC
#undef ZAPSTATE
#define XMBRTOWC xmbrtowc
#define ZAPSTATE(mbs) memset((mbs), 0, sizeof(*(mbs)))
#define MNAMES
#include "engine.c"
/*
- regexec - interface for matching
= extern int regexec(const regex_t *, const char *, size_t, \
@ -176,7 +235,9 @@ int eflags;
return(REG_BADPAT);
eflags = GOODFLAGS(eflags);
if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags&REG_LARGE))
if (MB_CUR_MAX > 1)
return(mmatcher(g, (char *)string, nmatch, pmatch, eflags));
else if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags&REG_LARGE))
return(smatcher(g, (char *)string, nmatch, pmatch, eflags));
else
return(lmatcher(g, (char *)string, nmatch, pmatch, eflags));

View File

@ -48,6 +48,8 @@ __FBSDID("$FreeBSD$");
#include <stdlib.h>
#include <limits.h>
#include <regex.h>
#include <wchar.h>
#include <wctype.h>
#include "utils.h"
#include "regex2.h"
@ -61,6 +63,7 @@ regfree(preg)
regex_t *preg;
{
struct re_guts *g;
int i;
if (preg->re_magic != MAGIC1) /* oops */
return; /* nice to complain, but hard */
@ -73,10 +76,14 @@ regex_t *preg;
if (g->strip != NULL)
free((char *)g->strip);
if (g->sets != NULL)
if (g->sets != NULL) {
for (i = 0; i < g->ncsets; i++) {
free(g->sets[i].ranges);
free(g->sets[i].wides);
free(g->sets[i].types);
}
free((char *)g->sets);
if (g->setbits != NULL)
free((char *)g->setbits);
}
if (g->must != NULL)
free(g->must);
if (g->charjump != NULL)