Kyle Evans 6b986646d4 libregex: implement \b and \B (word boundary, not word boundary)
This is the last of the needed GNU expressions before we can unleash bsdgrep
by default. \b is effectively an agnostic equivalent of \< and \>, while
\B will match every space that isn't making a transition from
nonchar -> char or char -> nonchar.
2020-12-05 03:16:05 +00:00

1219 lines
31 KiB
C

/*-
* SPDX-License-Identifier: BSD-3-Clause
*
* Copyright (c) 1992, 1993, 1994 Henry Spencer.
* Copyright (c) 1992, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Henry Spencer.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)engine.c 8.5 (Berkeley) 3/20/94
*/
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <stdbool.h>
/*
* The matching engine and friends. This file is #included by regexec.c
* after suitable #defines of a variety of macros used herein, so that
* different state representations can be used without duplicating masses
* of code.
*/
#ifdef SNAMES
#define stepback sstepback
#define matcher smatcher
#define walk swalk
#define dissect sdissect
#define backref sbackref
#define step sstep
#define print sprint
#define at sat
#define match smat
#endif
#ifdef LNAMES
#define stepback lstepback
#define matcher lmatcher
#define walk lwalk
#define dissect ldissect
#define backref lbackref
#define step lstep
#define print lprint
#define at lat
#define match lmat
#endif
#ifdef MNAMES
#define stepback mstepback
#define matcher mmatcher
#define walk mwalk
#define dissect mdissect
#define backref mbackref
#define step mstep
#define print mprint
#define at mat
#define match mmat
#endif
/* another structure passed up and down to avoid zillions of parameters */
struct match {
struct re_guts *g;
int eflags;
regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
const char *offp; /* offsets work from here */
const char *beginp; /* start of string -- virtual NUL precedes */
const char *endp; /* end of string -- virtual NUL here */
const char *coldp; /* can be no match starting before here */
const char **lastpos; /* [nplus+1] */
STATEVARS;
states st; /* current states */
states fresh; /* states for a fresh start */
states tmp; /* temporary */
states empty; /* empty set of states */
mbstate_t mbs; /* multibyte conversion state */
};
/* ========= begin header generated by ./mkh ========= */
#ifdef __cplusplus
extern "C" {
#endif
/* === engine.c === */
static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int);
static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast);
static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags);
#define MAX_RECURSION 100
#define BOL (OUT-1)
#define EOL (BOL-1)
#define BOLEOL (BOL-2)
#define NOTHING (BOL-3)
#define BOW (BOL-4)
#define EOW (BOL-5)
#define BADCHAR (BOL-6)
#define NWBND (BOL-7)
#define NONCHAR(c) ((c) <= OUT)
/* sflags */
#define SBOS 0x0001
#define SEOS 0x0002
#ifdef REDEBUG
static void print(struct match *m, const char *caption, states st, int ch, FILE *d);
#endif
#ifdef REDEBUG
static void at(struct match *m, const char *title, const char *start, const char *stop, sopno startst, sopno stopst);
#endif
#ifdef REDEBUG
static const char *pchar(int ch);
#endif
#ifdef __cplusplus
}
#endif
/* ========= end header generated by ./mkh ========= */
#ifdef REDEBUG
#define SP(t, s, c) print(m, t, s, c, stdout)
#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
#define NOTE(str) { if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
#else
#define SP(t, s, c) /* nothing */
#define AT(t, p1, p2, s1, s2) /* nothing */
#define NOTE(s) /* nothing */
#endif
/*
* Given a multibyte string pointed to by start, step back nchar characters
* from current position pointed to by cur.
*/
static const char *
stepback(const char *start, const char *cur, int nchar)
{
const char *ret;
int wc, mbc;
mbstate_t mbs;
size_t clen;
if (MB_CUR_MAX == 1)
return ((cur - nchar) > start ? cur - nchar : NULL);
ret = cur;
for (wc = nchar; wc > 0; wc--) {
for (mbc = 1; mbc <= MB_CUR_MAX; mbc++) {
if ((ret - mbc) < start)
return (NULL);
memset(&mbs, 0, sizeof(mbs));
clen = mbrtowc(NULL, ret - mbc, mbc, &mbs);
if (clen != (size_t)-1 && clen != (size_t)-2)
break;
}
if (mbc > MB_CUR_MAX)
return (NULL);
ret -= mbc;
}
return (ret);
}
/*
- matcher - the actual matching engine
== static int matcher(struct re_guts *g, const char *string, \
== size_t nmatch, regmatch_t pmatch[], int eflags);
*/
static int /* 0 success, REG_NOMATCH failure */
matcher(struct re_guts *g,
const char *string,
size_t nmatch,
regmatch_t pmatch[],
int eflags)
{
const char *endp;
size_t i;
struct match mv;
struct match *m = &mv;
const char *dp = NULL;
const sopno gf = g->firststate+1; /* +1 for OEND */
const sopno gl = g->laststate;
const char *start;
const char *stop;
/* Boyer-Moore algorithms variables */
const char *pp;
int cj, mj;
const char *mustfirst;
const char *mustlast;
int *matchjump;
int *charjump;
/* simplify the situation where possible */
if (g->cflags&REG_NOSUB)
nmatch = 0;
if (eflags&REG_STARTEND) {
start = string + pmatch[0].rm_so;
stop = string + pmatch[0].rm_eo;
} else {
start = string;
stop = start + strlen(start);
}
if (stop < start)
return(REG_INVARG);
/* prescreening; this does wonders for this rather slow code */
if (g->must != NULL) {
if (g->charjump != NULL && g->matchjump != NULL) {
mustfirst = g->must;
mustlast = g->must + g->mlen - 1;
charjump = g->charjump;
matchjump = g->matchjump;
pp = mustlast;
for (dp = start+g->mlen-1; dp < stop;) {
/* Fast skip non-matches */
while (dp < stop && charjump[(int)*dp])
dp += charjump[(int)*dp];
if (dp >= stop)
break;
/* Greedy matcher */
/* We depend on not being used for
* for strings of length 1
*/
while (*--dp == *--pp && pp != mustfirst);
if (*dp == *pp)
break;
/* Jump to next possible match */
mj = matchjump[pp - mustfirst];
cj = charjump[(int)*dp];
dp += (cj < mj ? mj : cj);
pp = mustlast;
}
if (pp != mustfirst)
return(REG_NOMATCH);
} else {
for (dp = start; dp < stop; dp++)
if (*dp == g->must[0] &&
stop - dp >= g->mlen &&
memcmp(dp, g->must, (size_t)g->mlen) == 0)
break;
if (dp == stop) /* we didn't find g->must */
return(REG_NOMATCH);
}
}
/* match struct setup */
m->g = g;
m->eflags = eflags;
m->pmatch = NULL;
m->lastpos = NULL;
m->offp = string;
m->beginp = start;
m->endp = stop;
STATESETUP(m, 4);
SETUP(m->st);
SETUP(m->fresh);
SETUP(m->tmp);
SETUP(m->empty);
CLEAR(m->empty);
ZAPSTATE(&m->mbs);
/* Adjust start according to moffset, to speed things up */
if (dp != NULL && g->moffset > -1) {
const char *nstart;
nstart = stepback(start, dp, g->moffset);
if (nstart != NULL)
start = nstart;
}
SP("mloop", m->st, *start);
/* this loop does only one repetition except for backrefs */
for (;;) {
endp = walk(m, start, stop, gf, gl, true);
if (endp == NULL) { /* a miss */
if (m->pmatch != NULL)
free((char *)m->pmatch);
if (m->lastpos != NULL)
free((char *)m->lastpos);
STATETEARDOWN(m);
return(REG_NOMATCH);
}
if (nmatch == 0 && !g->backrefs)
break; /* no further info needed */
/* where? */
assert(m->coldp != NULL);
for (;;) {
NOTE("finding start");
endp = walk(m, m->coldp, stop, gf, gl, false);
if (endp != NULL)
break;
assert(m->coldp < m->endp);
m->coldp += XMBRTOWC(NULL, m->coldp,
m->endp - m->coldp, &m->mbs, 0);
}
if (nmatch == 1 && !g->backrefs)
break; /* no further info needed */
/* oh my, he wants the subexpressions... */
if (m->pmatch == NULL)
m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
sizeof(regmatch_t));
if (m->pmatch == NULL) {
STATETEARDOWN(m);
return(REG_ESPACE);
}
for (i = 1; i <= m->g->nsub; i++)
m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
if (!g->backrefs && !(m->eflags&REG_BACKR)) {
NOTE("dissecting");
dp = dissect(m, m->coldp, endp, gf, gl);
} else {
if (g->nplus > 0 && m->lastpos == NULL)
m->lastpos = malloc((g->nplus+1) *
sizeof(const char *));
if (g->nplus > 0 && m->lastpos == NULL) {
free(m->pmatch);
STATETEARDOWN(m);
return(REG_ESPACE);
}
NOTE("backref dissect");
dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
}
if (dp != NULL)
break;
/* uh-oh... we couldn't find a subexpression-level match */
assert(g->backrefs); /* must be back references doing it */
assert(g->nplus == 0 || m->lastpos != NULL);
for (;;) {
if (dp != NULL || endp <= m->coldp)
break; /* defeat */
NOTE("backoff");
endp = walk(m, m->coldp, endp-1, gf, gl, false);
if (endp == NULL)
break; /* defeat */
/* try it on a shorter possibility */
#ifndef NDEBUG
for (i = 1; i <= m->g->nsub; i++) {
assert(m->pmatch[i].rm_so == -1);
assert(m->pmatch[i].rm_eo == -1);
}
#endif
NOTE("backoff dissect");
dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
}
assert(dp == NULL || dp == endp);
if (dp != NULL) /* found a shorter one */
break;
/* despite initial appearances, there is no match here */
NOTE("false alarm");
/* recycle starting later */
start = m->coldp + XMBRTOWC(NULL, m->coldp,
stop - m->coldp, &m->mbs, 0);
assert(start <= stop);
}
/* fill in the details if requested */
if (nmatch > 0) {
pmatch[0].rm_so = m->coldp - m->offp;
pmatch[0].rm_eo = endp - m->offp;
}
if (nmatch > 1) {
assert(m->pmatch != NULL);
for (i = 1; i < nmatch; i++)
if (i <= m->g->nsub)
pmatch[i] = m->pmatch[i];
else {
pmatch[i].rm_so = -1;
pmatch[i].rm_eo = -1;
}
}
if (m->pmatch != NULL)
free((char *)m->pmatch);
if (m->lastpos != NULL)
free((char *)m->lastpos);
STATETEARDOWN(m);
return(0);
}
/*
- dissect - figure out what matched what, no back references
== static const char *dissect(struct match *m, const char *start, \
== const char *stop, sopno startst, sopno stopst);
*/
static const char * /* == stop (success) always */
dissect(struct match *m,
const char *start,
const char *stop,
sopno startst,
sopno stopst)
{
int i;
sopno ss; /* start sop of current subRE */
sopno es; /* end sop of current subRE */
const char *sp; /* start of string matched by it */
const char *stp; /* string matched by it cannot pass here */
const char *rest; /* start of rest of string */
const char *tail; /* string unmatched by rest of RE */
sopno ssub; /* start sop of subsubRE */
sopno esub; /* end sop of subsubRE */
const char *ssp; /* start of string matched by subsubRE */
const char *sep; /* end of string matched by subsubRE */
const char *oldssp; /* previous ssp */
const char *dp __unused;
AT("diss", start, stop, startst, stopst);
sp = start;
for (ss = startst; ss < stopst; ss = es) {
/* identify end of subRE */
es = ss;
switch (OP(m->g->strip[es])) {
case OPLUS_:
case OQUEST_:
es += OPND(m->g->strip[es]);
break;
case OCH_:
while (OP(m->g->strip[es]) != (sop)O_CH)
es += OPND(m->g->strip[es]);
break;
}
es++;
/* figure out what it matched */
switch (OP(m->g->strip[ss])) {
case OEND:
assert(nope);
break;
case OCHAR:
sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
break;
case OBOL:
case OEOL:
case OBOW:
case OEOW:
case OBOS:
case OEOS:
case OWBND:
case ONWBND:
break;
case OANY:
case OANYOF:
sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
break;
case OBACK_:
case O_BACK:
assert(nope);
break;
/* cases where length of match is hard to find */
case OQUEST_:
stp = stop;
for (;;) {
/* how long could this one be? */
rest = walk(m, sp, stp, ss, es, false);
assert(rest != NULL); /* it did match */
/* could the rest match the rest? */
tail = walk(m, rest, stop, es, stopst, false);
if (tail == stop)
break; /* yes! */
/* no -- try a shorter match for this one */
stp = rest - 1;
assert(stp >= sp); /* it did work */
}
ssub = ss + 1;
esub = es - 1;
/* did innards match? */
if (walk(m, sp, rest, ssub, esub, false) != NULL) {
dp = dissect(m, sp, rest, ssub, esub);
assert(dp == rest);
} else /* no */
assert(sp == rest);
sp = rest;
break;
case OPLUS_:
stp = stop;
for (;;) {
/* how long could this one be? */
rest = walk(m, sp, stp, ss, es, false);
assert(rest != NULL); /* it did match */
/* could the rest match the rest? */
tail = walk(m, rest, stop, es, stopst, false);
if (tail == stop)
break; /* yes! */
/* no -- try a shorter match for this one */
stp = rest - 1;
assert(stp >= sp); /* it did work */
}
ssub = ss + 1;
esub = es - 1;
ssp = sp;
oldssp = ssp;
for (;;) { /* find last match of innards */
sep = walk(m, ssp, rest, ssub, esub, false);
if (sep == NULL || sep == ssp)
break; /* failed or matched null */
oldssp = ssp; /* on to next try */
ssp = sep;
}
if (sep == NULL) {
/* last successful match */
sep = ssp;
ssp = oldssp;
}
assert(sep == rest); /* must exhaust substring */
assert(walk(m, ssp, sep, ssub, esub, false) == rest);
dp = dissect(m, ssp, sep, ssub, esub);
assert(dp == sep);
sp = rest;
break;
case OCH_:
stp = stop;
for (;;) {
/* how long could this one be? */
rest = walk(m, sp, stp, ss, es, false);
assert(rest != NULL); /* it did match */
/* could the rest match the rest? */
tail = walk(m, rest, stop, es, stopst, false);
if (tail == stop)
break; /* yes! */
/* no -- try a shorter match for this one */
stp = rest - 1;
assert(stp >= sp); /* it did work */
}
ssub = ss + 1;
esub = ss + OPND(m->g->strip[ss]) - 1;
assert(OP(m->g->strip[esub]) == OOR1);
for (;;) { /* find first matching branch */
if (walk(m, sp, rest, ssub, esub, false) == rest)
break; /* it matched all of it */
/* that one missed, try next one */
assert(OP(m->g->strip[esub]) == OOR1);
esub++;
assert(OP(m->g->strip[esub]) == OOR2);
ssub = esub + 1;
esub += OPND(m->g->strip[esub]);
if (OP(m->g->strip[esub]) == (sop)OOR2)
esub--;
else
assert(OP(m->g->strip[esub]) == O_CH);
}
dp = dissect(m, sp, rest, ssub, esub);
assert(dp == rest);
sp = rest;
break;
case O_PLUS:
case O_QUEST:
case OOR1:
case OOR2:
case O_CH:
assert(nope);
break;
case OLPAREN:
i = OPND(m->g->strip[ss]);
assert(0 < i && i <= m->g->nsub);
m->pmatch[i].rm_so = sp - m->offp;
break;
case ORPAREN:
i = OPND(m->g->strip[ss]);
assert(0 < i && i <= m->g->nsub);
m->pmatch[i].rm_eo = sp - m->offp;
break;
default: /* uh oh */
assert(nope);
break;
}
}
assert(sp == stop);
return(sp);
}
#define ISBOW(m, sp) \
(sp < m->endp && ISWORD(*sp) && \
((sp == m->beginp && !(m->eflags&REG_NOTBOL)) || \
(sp > m->offp && !ISWORD(*(sp-1)))))
#define ISEOW(m, sp) \
(((sp == m->endp && !(m->eflags&REG_NOTEOL)) || \
(sp < m->endp && *sp == '\n' && \
(m->g->cflags&REG_NEWLINE)) || \
(sp < m->endp && !ISWORD(*sp)) ) && \
(sp > m->beginp && ISWORD(*(sp-1)))) \
/*
- backref - figure out what matched what, figuring in back references
== static const char *backref(struct match *m, const char *start, \
== const char *stop, sopno startst, sopno stopst, sopno lev);
*/
static const char * /* == stop (success) or NULL (failure) */
backref(struct match *m,
const char *start,
const char *stop,
sopno startst,
sopno stopst,
sopno lev, /* PLUS nesting level */
int rec)
{
int i;
sopno ss; /* start sop of current subRE */
const char *sp; /* start of string matched by it */
sopno ssub; /* start sop of subsubRE */
sopno esub; /* end sop of subsubRE */
const char *ssp; /* start of string matched by subsubRE */
const char *dp;
size_t len;
int hard;
sop s;
regoff_t offsave;
cset *cs;
wint_t wc;
AT("back", start, stop, startst, stopst);
sp = start;
/* get as far as we can with easy stuff */
hard = 0;
for (ss = startst; !hard && ss < stopst; ss++)
switch (OP(s = m->g->strip[ss])) {
case OCHAR:
if (sp == stop)
return(NULL);
sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
if (wc != OPND(s))
return(NULL);
break;
case OANY:
if (sp == stop)
return(NULL);
sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
if (wc == BADCHAR)
return (NULL);
break;
case OANYOF:
if (sp == stop)
return (NULL);
cs = &m->g->sets[OPND(s)];
sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
if (wc == BADCHAR || !CHIN(cs, wc))
return(NULL);
break;
case OBOS:
if (sp == m->beginp && (m->eflags & REG_NOTBOL) == 0)
{ /* yes */ }
else
return(NULL);
break;
case OEOS:
if (sp == m->endp && (m->eflags & REG_NOTEOL) == 0)
{ /* yes */ }
else
return(NULL);
break;
case OBOL:
if ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
(sp > m->offp && sp < m->endp &&
*(sp-1) == '\n' && (m->g->cflags&REG_NEWLINE)))
{ /* yes */ }
else
return(NULL);
break;
case OEOL:
if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
(sp < m->endp && *sp == '\n' &&
(m->g->cflags&REG_NEWLINE)) )
{ /* yes */ }
else
return(NULL);
break;
case OWBND:
if (ISBOW(m, sp) || ISEOW(m, sp))
{ /* yes */ }
else
return(NULL);
break;
case ONWBND:
if (((sp == m->beginp) && !ISWORD(*sp)) ||
(sp == m->endp && !ISWORD(*(sp - 1))))
{ /* yes, beginning/end of subject */ }
else if (ISWORD(*(sp - 1)) == ISWORD(*sp))
{ /* yes, beginning/end of subject */ }
else
return(NULL);
break;
case OBOW:
if (ISBOW(m, sp))
{ /* yes */ }
else
return(NULL);
break;
case OEOW:
if (ISEOW(m, sp))
{ /* yes */ }
else
return(NULL);
break;
case O_QUEST:
break;
case OOR1: /* matches null but needs to skip */
ss++;
s = m->g->strip[ss];
do {
assert(OP(s) == OOR2);
ss += OPND(s);
} while (OP(s = m->g->strip[ss]) != (sop)O_CH);
/* note that the ss++ gets us past the O_CH */
break;
default: /* have to make a choice */
hard = 1;
break;
}
if (!hard) { /* that was it! */
if (sp != stop)
return(NULL);
return(sp);
}
ss--; /* adjust for the for's final increment */
/* the hard stuff */
AT("hard", sp, stop, ss, stopst);
s = m->g->strip[ss];
switch (OP(s)) {
case OBACK_: /* the vilest depths */
i = OPND(s);
assert(0 < i && i <= m->g->nsub);
if (m->pmatch[i].rm_eo == -1)
return(NULL);
assert(m->pmatch[i].rm_so != -1);
len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
if (len == 0 && rec++ > MAX_RECURSION)
return(NULL);
assert(stop - m->beginp >= len);
if (sp > stop - len)
return(NULL); /* not enough left to match */
ssp = m->offp + m->pmatch[i].rm_so;
if (memcmp(sp, ssp, len) != 0)
return(NULL);
while (m->g->strip[ss] != (sop)SOP(O_BACK, i))
ss++;
return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
case OQUEST_: /* to null or not */
dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
if (dp != NULL)
return(dp); /* not */
return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
case OPLUS_:
assert(m->lastpos != NULL);
assert(lev+1 <= m->g->nplus);
m->lastpos[lev+1] = sp;
return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
case O_PLUS:
if (sp == m->lastpos[lev]) /* last pass matched null */
return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
/* try another pass */
m->lastpos[lev] = sp;
dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
if (dp == NULL)
return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
else
return(dp);
case OCH_: /* find the right one, if any */
ssub = ss + 1;
esub = ss + OPND(s) - 1;
assert(OP(m->g->strip[esub]) == OOR1);
for (;;) { /* find first matching branch */
dp = backref(m, sp, stop, ssub, esub, lev, rec);
if (dp != NULL)
return(dp);
/* that one missed, try next one */
if (OP(m->g->strip[esub]) == (sop)O_CH)
return(NULL); /* there is none */
esub++;
assert(OP(m->g->strip[esub]) == (sop)OOR2);
ssub = esub + 1;
esub += OPND(m->g->strip[esub]);
if (OP(m->g->strip[esub]) == (sop)OOR2)
esub--;
else
assert(OP(m->g->strip[esub]) == O_CH);
}
/* NOTREACHED */
break;
case OLPAREN: /* must undo assignment if rest fails */
i = OPND(s);
assert(0 < i && i <= m->g->nsub);
offsave = m->pmatch[i].rm_so;
m->pmatch[i].rm_so = sp - m->offp;
dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
if (dp != NULL)
return(dp);
m->pmatch[i].rm_so = offsave;
return(NULL);
case ORPAREN: /* must undo assignment if rest fails */
i = OPND(s);
assert(0 < i && i <= m->g->nsub);
offsave = m->pmatch[i].rm_eo;
m->pmatch[i].rm_eo = sp - m->offp;
dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
if (dp != NULL)
return(dp);
m->pmatch[i].rm_eo = offsave;
return(NULL);
default: /* uh oh */
assert(nope);
break;
}
/* "can't happen" */
assert(nope);
/* NOTREACHED */
return "shut up gcc";
}
/*
- walk - step through the string either quickly or slowly
== static const char *walk(struct match *m, const char *start, \
== const char *stop, sopno startst, sopno stopst, bool fast);
*/
static const char * /* where it ended, or NULL */
walk(struct match *m, const char *start, const char *stop, sopno startst,
sopno stopst, bool fast)
{
states st = m->st;
states fresh = m->fresh;
states empty = m->empty;
states tmp = m->tmp;
const char *p = start;
wint_t c;
wint_t lastc; /* previous c */
wint_t flagch;
int i, sflags;
const char *matchp; /* last p at which a match ended */
size_t clen;
sflags = 0;
AT("slow", start, stop, startst, stopst);
CLEAR(st);
SET1(st, startst);
SP("sstart", st, *p);
st = step(m->g, startst, stopst, st, NOTHING, st, sflags);
if (fast)
ASSIGN(fresh, st);
matchp = NULL;
if (start == m->offp || (start == m->beginp && !(m->eflags&REG_NOTBOL)))
c = OUT;
else {
/*
* XXX Wrong if the previous character was multi-byte.
* Newline never is (in encodings supported by FreeBSD),
* so this only breaks the ISWORD tests below.
*/
c = (uch)*(start - 1);
}
for (;;) {
/* next character */
lastc = c;
sflags = 0;
if (p == m->endp) {
c = OUT;
clen = 0;
} else
clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
if (fast && EQ(st, fresh))
matchp = p;
/* is there an EOL and/or BOL between lastc and c? */
flagch = '\0';
i = 0;
if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
flagch = BOL;
i = m->g->nbol;
}
if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
flagch = (flagch == BOL) ? BOLEOL : EOL;
i += m->g->neol;
}
if (lastc == OUT && (m->eflags & REG_NOTBOL) == 0) {
sflags |= SBOS;
/* Step one more for BOS. */
i++;
}
if (c == OUT && (m->eflags & REG_NOTEOL) == 0) {
sflags |= SEOS;
/* Step one more for EOS. */
i++;
}
if (i != 0) {
for (; i > 0; i--)
st = step(m->g, startst, stopst, st, flagch, st,
sflags);
SP("sboleol", st, c);
}
/* how about a word boundary? */
if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
(c != OUT && ISWORD(c)) ) {
flagch = BOW;
}
if ( (lastc != OUT && ISWORD(lastc)) &&
(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
flagch = EOW;
}
if (flagch == BOW || flagch == EOW) {
st = step(m->g, startst, stopst, st, flagch, st, sflags);
SP("sboweow", st, c);
}
if (lastc != OUT && c != OUT &&
ISWORD(lastc) == ISWORD(c)) {
flagch = NWBND;
} else if ((lastc == OUT && !ISWORD(c)) ||
(c == OUT && !ISWORD(lastc))) {
flagch = NWBND;
}
if (flagch == NWBND) {
st = step(m->g, startst, stopst, st, flagch, st, sflags);
SP("snwbnd", st, c);
}
/* are we done? */
if (ISSET(st, stopst)) {
if (fast)
break;
else
matchp = p;
}
if (EQ(st, empty) || p == stop || clen > (size_t)(stop - p))
break; /* NOTE BREAK OUT */
/* no, we must deal with this character */
ASSIGN(tmp, st);
if (fast)
ASSIGN(st, fresh);
else
ASSIGN(st, empty);
assert(c != OUT);
st = step(m->g, startst, stopst, tmp, c, st, sflags);
SP("saft", st, c);
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags),
st));
p += clen;
}
if (fast) {
assert(matchp != NULL);
m->coldp = matchp;
if (ISSET(st, stopst))
return (p + XMBRTOWC(NULL, p, stop - p, &m->mbs, 0));
else
return (NULL);
} else
return (matchp);
}
/*
- step - map set of states reachable before char to set reachable after
== static states step(struct re_guts *g, sopno start, sopno stop, \
== states bef, int ch, states aft);
== #define BOL (OUT-1)
== #define EOL (BOL-1)
== #define BOLEOL (BOL-2)
== #define NOTHING (BOL-3)
== #define BOW (BOL-4)
== #define EOW (BOL-5)
== #define BADCHAR (BOL-6)
== #define NONCHAR(c) ((c) <= OUT)
*/
static states
step(struct re_guts *g,
sopno start, /* start state within strip */
sopno stop, /* state after stop state within strip */
states bef, /* states reachable before */
wint_t ch, /* character or NONCHAR code */
states aft, /* states already known reachable after */
int sflags) /* state flags */
{
cset *cs;
sop s;
sopno pc;
onestate here; /* note, macros know this name */
sopno look;
int i;
for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
s = g->strip[pc];
switch (OP(s)) {
case OEND:
assert(pc == stop-1);
break;
case OCHAR:
/* only characters can match */
assert(!NONCHAR(ch) || ch != OPND(s));
if (ch == OPND(s))
FWD(aft, bef, 1);
break;
case OBOS:
if ((ch == BOL || ch == BOLEOL) && (sflags & SBOS) != 0)
FWD(aft, bef, 1);
break;
case OEOS:
if ((ch == EOL || ch == BOLEOL) && (sflags & SEOS) != 0)
FWD(aft, bef, 1);
break;
case OBOL:
if (ch == BOL || ch == BOLEOL)
FWD(aft, bef, 1);
break;
case OEOL:
if (ch == EOL || ch == BOLEOL)
FWD(aft, bef, 1);
break;
case OBOW:
if (ch == BOW)
FWD(aft, bef, 1);
break;
case OEOW:
if (ch == EOW)
FWD(aft, bef, 1);
break;
case OWBND:
if (ch == BOW || ch == EOW)
FWD(aft, bef, 1);
break;
case ONWBND:
if (ch == NWBND)
FWD(aft, aft, 1);
break;
case OANY:
if (!NONCHAR(ch))
FWD(aft, bef, 1);
break;
case OANYOF:
cs = &g->sets[OPND(s)];
if (!NONCHAR(ch) && CHIN(cs, ch))
FWD(aft, bef, 1);
break;
case OBACK_: /* ignored here */
case O_BACK:
FWD(aft, aft, 1);
break;
case OPLUS_: /* forward, this is just an empty */
FWD(aft, aft, 1);
break;
case O_PLUS: /* both forward and back */
FWD(aft, aft, 1);
i = ISSETBACK(aft, OPND(s));
BACK(aft, aft, OPND(s));
if (!i && ISSETBACK(aft, OPND(s))) {
/* oho, must reconsider loop body */
pc -= OPND(s) + 1;
INIT(here, pc);
}
break;
case OQUEST_: /* two branches, both forward */
FWD(aft, aft, 1);
FWD(aft, aft, OPND(s));
break;
case O_QUEST: /* just an empty */
FWD(aft, aft, 1);
break;
case OLPAREN: /* not significant here */
case ORPAREN:
FWD(aft, aft, 1);
break;
case OCH_: /* mark the first two branches */
FWD(aft, aft, 1);
assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2);
FWD(aft, aft, OPND(s));
break;
case OOR1: /* done a branch, find the O_CH */
if (ISSTATEIN(aft, here)) {
for (look = 1;
OP(s = g->strip[pc+look]) != (sop)O_CH;
look += OPND(s))
assert(OP(s) == (sop)OOR2);
FWD(aft, aft, look + 1);
}
break;
case OOR2: /* propagate OCH_'s marking */
FWD(aft, aft, 1);
if (OP(g->strip[pc+OPND(s)]) != (sop)O_CH) {
assert(OP(g->strip[pc+OPND(s)]) == (sop)OOR2);
FWD(aft, aft, OPND(s));
}
break;
case O_CH: /* just empty */
FWD(aft, aft, 1);
break;
default: /* ooooops... */
assert(nope);
break;
}
}
return(aft);
}
#ifdef REDEBUG
/*
- print - print a set of states
== #ifdef REDEBUG
== static void print(struct match *m, const char *caption, states st, \
== int ch, FILE *d);
== #endif
*/
static void
print(struct match *m,
const char *caption,
states st,
int ch,
FILE *d)
{
struct re_guts *g = m->g;
sopno i;
int first = 1;
if (!(m->eflags&REG_TRACE))
return;
fprintf(d, "%s", caption);
if (ch != '\0')
fprintf(d, " %s", pchar(ch));
for (i = 0; i < g->nstates; i++)
if (ISSET(st, i)) {
fprintf(d, "%s%lu", (first) ? "\t" : ", ", i);
first = 0;
}
fprintf(d, "\n");
}
/*
- at - print current situation
== #ifdef REDEBUG
== static void at(struct match *m, const char *title, const char *start, \
== const char *stop, sopno startst, sopno stopst);
== #endif
*/
static void
at( struct match *m,
const char *title,
const char *start,
const char *stop,
sopno startst,
sopno stopst)
{
if (!(m->eflags&REG_TRACE))
return;
printf("%s %s-", title, pchar(*start));
printf("%s ", pchar(*stop));
printf("%ld-%ld\n", (long)startst, (long)stopst);
}
#ifndef PCHARDONE
#define PCHARDONE /* never again */
/*
- pchar - make a character printable
== #ifdef REDEBUG
== static const char *pchar(int ch);
== #endif
*
* Is this identical to regchar() over in debug.c? Well, yes. But a
* duplicate here avoids having a debugging-capable regexec.o tied to
* a matching debug.o, and this is convenient. It all disappears in
* the non-debug compilation anyway, so it doesn't matter much.
*/
static const char * /* -> representation */
pchar(int ch)
{
static char pbuf[10];
if (isprint((uch)ch) || ch == ' ')
sprintf(pbuf, "%c", ch);
else
sprintf(pbuf, "\\%o", ch);
return(pbuf);
}
#endif
#endif
#undef stepback
#undef matcher
#undef walk
#undef dissect
#undef backref
#undef step
#undef print
#undef at
#undef match