From a0bf5d8a682d6137a5ec7fd834d9742376f2c9c8 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Wed, 9 Aug 2017 01:04:36 +0000 Subject: [PATCH] regex(3): Refactor fast/slow stepping bits in the matching engine Adding features for matching is fairly straightforward, but this requires some duplication because of this fast/slow setup. They can be fairly trivially combined into a single walk(), so do it to make future additions less error prone. Reviewed by: cem (earlier version), emaste, pfg Approved by: emaste (mentor) Differential Revision: https://reviews.freebsd.org/D11233 --- lib/libc/regex/engine.c | 192 +++++++++++----------------------------- 1 file changed, 50 insertions(+), 142 deletions(-) diff --git a/lib/libc/regex/engine.c b/lib/libc/regex/engine.c index 073e0010882e..7c70b685437d 100644 --- a/lib/libc/regex/engine.c +++ b/lib/libc/regex/engine.c @@ -36,6 +36,8 @@ #include __FBSDID("$FreeBSD$"); +#include + /* * The matching engine and friends. This file is #included by regexec.c * after suitable #defines of a variety of macros used herein, so that @@ -45,8 +47,7 @@ __FBSDID("$FreeBSD$"); #ifdef SNAMES #define matcher smatcher -#define fast sfast -#define slow sslow +#define walk swalk #define dissect sdissect #define backref sbackref #define step sstep @@ -56,8 +57,7 @@ __FBSDID("$FreeBSD$"); #endif #ifdef LNAMES #define matcher lmatcher -#define fast lfast -#define slow lslow +#define walk lwalk #define dissect ldissect #define backref lbackref #define step lstep @@ -67,8 +67,7 @@ __FBSDID("$FreeBSD$"); #endif #ifdef MNAMES #define matcher mmatcher -#define fast mfast -#define slow mslow +#define walk mwalk #define dissect mdissect #define backref mbackref #define step mstep @@ -104,8 +103,7 @@ extern "C" { static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); -static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); -static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); +static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast); static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); #define MAX_RECURSION 100 #define BOL (OUT-1) @@ -251,7 +249,7 @@ matcher(struct re_guts *g, /* this loop does only one repetition except for backrefs */ for (;;) { - endp = fast(m, start, stop, gf, gl); + endp = walk(m, start, stop, gf, gl, true); if (endp == NULL) { /* a miss */ if (m->pmatch != NULL) free((char *)m->pmatch); @@ -267,7 +265,7 @@ matcher(struct re_guts *g, assert(m->coldp != NULL); for (;;) { NOTE("finding start"); - endp = slow(m, m->coldp, stop, gf, gl); + endp = walk(m, m->coldp, stop, gf, gl, false); if (endp != NULL) break; assert(m->coldp < m->endp); @@ -312,7 +310,7 @@ matcher(struct re_guts *g, if (dp != NULL || endp <= m->coldp) break; /* defeat */ NOTE("backoff"); - endp = slow(m, m->coldp, endp-1, gf, gl); + endp = walk(m, m->coldp, endp-1, gf, gl, false); if (endp == NULL) break; /* defeat */ /* try it on a shorter possibility */ @@ -430,10 +428,10 @@ dissect(struct match *m, stp = stop; for (;;) { /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); + rest = walk(m, sp, stp, ss, es, false); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); + tail = walk(m, rest, stop, es, stopst, false); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ @@ -443,7 +441,7 @@ dissect(struct match *m, ssub = ss + 1; esub = es - 1; /* did innards match? */ - if (slow(m, sp, rest, ssub, esub) != NULL) { + if (walk(m, sp, rest, ssub, esub, false) != NULL) { dp = dissect(m, sp, rest, ssub, esub); assert(dp == rest); } else /* no */ @@ -454,10 +452,10 @@ dissect(struct match *m, stp = stop; for (;;) { /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); + rest = walk(m, sp, stp, ss, es, false); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); + tail = walk(m, rest, stop, es, stopst, false); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ @@ -469,7 +467,7 @@ dissect(struct match *m, ssp = sp; oldssp = ssp; for (;;) { /* find last match of innards */ - sep = slow(m, ssp, rest, ssub, esub); + sep = walk(m, ssp, rest, ssub, esub, false); if (sep == NULL || sep == ssp) break; /* failed or matched null */ oldssp = ssp; /* on to next try */ @@ -481,7 +479,7 @@ dissect(struct match *m, ssp = oldssp; } assert(sep == rest); /* must exhaust substring */ - assert(slow(m, ssp, sep, ssub, esub) == rest); + assert(walk(m, ssp, sep, ssub, esub, false) == rest); dp = dissect(m, ssp, sep, ssub, esub); assert(dp == sep); sp = rest; @@ -490,10 +488,10 @@ dissect(struct match *m, stp = stop; for (;;) { /* how long could this one be? */ - rest = slow(m, sp, stp, ss, es); + rest = walk(m, sp, stp, ss, es, false); assert(rest != NULL); /* it did match */ /* could the rest match the rest? */ - tail = slow(m, rest, stop, es, stopst); + tail = walk(m, rest, stop, es, stopst, false); if (tail == stop) break; /* yes! */ /* no -- try a shorter match for this one */ @@ -504,7 +502,7 @@ dissect(struct match *m, esub = ss + OPND(m->g->strip[ss]) - 1; assert(OP(m->g->strip[esub]) == OOR1); for (;;) { /* find first matching branch */ - if (slow(m, sp, rest, ssub, esub) == rest) + if (walk(m, sp, rest, ssub, esub, false) == rest) break; /* it matched all of it */ /* that one missed, try next one */ assert(OP(m->g->strip[esub]) == OOR1); @@ -757,124 +755,16 @@ backref(struct match *m, } /* - - fast - step through the string at top speed - == static const char *fast(struct match *m, const char *start, \ - == const char *stop, sopno startst, sopno stopst); + - walk - step through the string either quickly or slowly + == static const char *walk(struct match *m, const char *start, \ + == const char *stop, sopno startst, sopno stopst, bool fast); */ -static const char * /* where tentative match ended, or NULL */ -fast( struct match *m, - const char *start, - const char *stop, - sopno startst, - sopno stopst) +static const char * /* where it ended, or NULL */ +walk(struct match *m, const char *start, const char *stop, sopno startst, + sopno stopst, bool fast) { states st = m->st; states fresh = m->fresh; - states tmp = m->tmp; - const char *p = start; - wint_t c; - wint_t lastc; /* previous c */ - wint_t flagch; - int i; - const char *coldp; /* last p after which no match was underway */ - size_t clen; - - CLEAR(st); - SET1(st, startst); - SP("fast", st, *p); - st = step(m->g, startst, stopst, st, NOTHING, st); - ASSIGN(fresh, st); - SP("start", st, *p); - coldp = NULL; - if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) - c = OUT; - else { - /* - * XXX Wrong if the previous character was multi-byte. - * Newline never is (in encodings supported by FreeBSD), - * so this only breaks the ISWORD tests below. - */ - c = (uch)*(start - 1); - } - for (;;) { - /* next character */ - lastc = c; - if (p == m->endp) { - clen = 0; - c = OUT; - } else - clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); - if (EQ(st, fresh)) - coldp = p; - - /* is there an EOL and/or BOL between lastc and c? */ - flagch = '\0'; - i = 0; - if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || - (lastc == OUT && !(m->eflags®_NOTBOL)) ) { - flagch = BOL; - i = m->g->nbol; - } - if ( (c == '\n' && m->g->cflags®_NEWLINE) || - (c == OUT && !(m->eflags®_NOTEOL)) ) { - flagch = (flagch == BOL) ? BOLEOL : EOL; - i += m->g->neol; - } - if (i != 0) { - for (; i > 0; i--) - st = step(m->g, startst, stopst, st, flagch, st); - SP("boleol", st, c); - } - - /* how about a word boundary? */ - if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && - (c != OUT && ISWORD(c)) ) { - flagch = BOW; - } - if ( (lastc != OUT && ISWORD(lastc)) && - (flagch == EOL || (c != OUT && !ISWORD(c))) ) { - flagch = EOW; - } - if (flagch == BOW || flagch == EOW) { - st = step(m->g, startst, stopst, st, flagch, st); - SP("boweow", st, c); - } - - /* are we done? */ - if (ISSET(st, stopst) || p == stop || clen > stop - p) - break; /* NOTE BREAK OUT */ - - /* no, we must deal with this character */ - ASSIGN(tmp, st); - ASSIGN(st, fresh); - assert(c != OUT); - st = step(m->g, startst, stopst, tmp, c, st); - SP("aft", st, c); - assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); - p += clen; - } - - assert(coldp != NULL); - m->coldp = coldp; - if (ISSET(st, stopst)) - return(p+XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); - else - return(NULL); -} - -/* - - slow - step through the string more deliberately - == static const char *slow(struct match *m, const char *start, \ - == const char *stop, sopno startst, sopno stopst); - */ -static const char * /* where it ended */ -slow( struct match *m, - const char *start, - const char *stop, - sopno startst, - sopno stopst) -{ - states st = m->st; states empty = m->empty; states tmp = m->tmp; const char *p = start; @@ -890,6 +780,8 @@ slow( struct match *m, SET1(st, startst); SP("sstart", st, *p); st = step(m->g, startst, stopst, st, NOTHING, st); + if (fast) + ASSIGN(fresh, st); matchp = NULL; if (start == m->offp || (start == m->beginp && !(m->eflags®_NOTBOL))) c = OUT; @@ -910,6 +802,9 @@ slow( struct match *m, } else clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); + if (fast && EQ(st, fresh)) + matchp = p; + /* is there an EOL and/or BOL between lastc and c? */ flagch = '\0'; i = 0; @@ -944,14 +839,21 @@ slow( struct match *m, } /* are we done? */ - if (ISSET(st, stopst)) - matchp = p; + if (ISSET(st, stopst)) { + if (fast) + break; + else + matchp = p; + } if (EQ(st, empty) || p == stop || clen > stop - p) break; /* NOTE BREAK OUT */ /* no, we must deal with this character */ ASSIGN(tmp, st); - ASSIGN(st, empty); + if (fast) + ASSIGN(st, fresh); + else + ASSIGN(st, empty); assert(c != OUT); st = step(m->g, startst, stopst, tmp, c, st); SP("saft", st, c); @@ -959,10 +861,17 @@ slow( struct match *m, p += clen; } - return(matchp); + if (fast) { + assert(matchp != NULL); + m->coldp = matchp; + if (ISSET(st, stopst)) + return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0)); + else + return (NULL); + } else + return (matchp); } - /* - step - map set of states reachable before char to set reachable after == static states step(struct re_guts *g, sopno start, sopno stop, \ @@ -1173,8 +1082,7 @@ pchar(int ch) #endif #undef matcher -#undef fast -#undef slow +#undef walk #undef dissect #undef backref #undef step