The impending libregex will implement GNU extensions to bring BREs and
EREs closer together. Prepare for this and reduce the diff of libregex changes by refactoring and combining the top-level parsers for EREs/BREs ahead of time. Branching functionality has been split out to make it easier to follow the combined version of the top-level parser. It may also be enabled in the parsing context to make it easier when libregex enables branching for BREs. A branching context was also added for the various branching functions and so that BREs, for instance, can determine if they're the first expression in a chain of expressions within the current branch and treat '*' as ordinary if so. This should have no functional impact and negligible performance impact. Reviewed by: cem, emaste, pfg Approved by: emaste (mentor) MFC after: 1 month Differential Revision: https://reviews.freebsd.org/D10920
This commit is contained in:
parent
81e12dd187
commit
15ae9efabe
@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
#include <regex.h>
|
||||
#include <stdbool.h>
|
||||
#include <wchar.h>
|
||||
#include <wctype.h>
|
||||
|
||||
@ -61,6 +62,24 @@ __FBSDID("$FreeBSD$");
|
||||
|
||||
#include "cname.h"
|
||||
|
||||
/*
|
||||
* Branching context, used to keep track of branch state for all of the branch-
|
||||
* aware functions. In addition to keeping track of branch positions for the
|
||||
* p_branch_* functions, we use this to simplify some clumsiness in BREs for
|
||||
* detection of whether ^ is acting as an anchor or being used erroneously and
|
||||
* also for whether we're in a sub-expression or not.
|
||||
*/
|
||||
struct branchc {
|
||||
sopno start;
|
||||
sopno back;
|
||||
sopno fwd;
|
||||
|
||||
int nbranch;
|
||||
int nchain;
|
||||
bool outer;
|
||||
bool terminate;
|
||||
};
|
||||
|
||||
/*
|
||||
* parse structure, passed up and down to avoid global variables and
|
||||
* other clumsinesses
|
||||
@ -77,6 +96,11 @@ struct parse {
|
||||
# define NPAREN 10 /* we need to remember () 1-9 for back refs */
|
||||
sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
|
||||
sopno pend[NPAREN]; /* -> ) ([0] unused) */
|
||||
bool allowbranch; /* can this expression branch? */
|
||||
bool bre; /* convenience; is this a BRE? */
|
||||
bool (*parse_expr)(struct parse *, struct branchc *);
|
||||
void (*pre_parse)(struct parse *, struct branchc *);
|
||||
void (*post_parse)(struct parse *, struct branchc *);
|
||||
};
|
||||
|
||||
/* ========= begin header generated by ./mkh ========= */
|
||||
@ -85,11 +109,17 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
/* === regcomp.c === */
|
||||
static void p_ere(struct parse *p, int stop);
|
||||
static void p_ere_exp(struct parse *p);
|
||||
static bool p_ere_exp(struct parse *p, struct branchc *bc);
|
||||
static void p_str(struct parse *p);
|
||||
static void p_bre(struct parse *p, int end1, int end2);
|
||||
static int p_simp_re(struct parse *p, int starordinary);
|
||||
static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
|
||||
static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
|
||||
static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
|
||||
static void p_branch_empty(struct parse *p, struct branchc *bc);
|
||||
static bool p_branch_do(struct parse *p, struct branchc *bc);
|
||||
static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
|
||||
static void p_bre_post_parse(struct parse *p, struct branchc *bc);
|
||||
static void p_re(struct parse *p, int end1, int end2);
|
||||
static bool p_simp_re(struct parse *p, struct branchc *bc);
|
||||
static int p_count(struct parse *p);
|
||||
static void p_bracket(struct parse *p);
|
||||
static void p_b_term(struct parse *p, cset *cs);
|
||||
@ -139,6 +169,7 @@ static char nuls[10]; /* place to point scanner in event of error */
|
||||
#define MORE2() (p->next+1 < p->end)
|
||||
#define SEE(c) (MORE() && PEEK() == (c))
|
||||
#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
|
||||
#define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
|
||||
#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
|
||||
#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
|
||||
#define NEXT() (p->next++)
|
||||
@ -247,6 +278,19 @@ regcomp(regex_t * __restrict preg,
|
||||
p->pbegin[i] = 0;
|
||||
p->pend[i] = 0;
|
||||
}
|
||||
if (cflags & REG_EXTENDED) {
|
||||
p->allowbranch = true;
|
||||
p->bre = false;
|
||||
p->parse_expr = p_ere_exp;
|
||||
p->pre_parse = NULL;
|
||||
p->post_parse = NULL;
|
||||
} else {
|
||||
p->allowbranch = false;
|
||||
p->bre = true;
|
||||
p->parse_expr = p_simp_re;
|
||||
p->pre_parse = p_bre_pre_parse;
|
||||
p->post_parse = p_bre_post_parse;
|
||||
}
|
||||
g->sets = NULL;
|
||||
g->ncsets = 0;
|
||||
g->cflags = cflags;
|
||||
@ -264,12 +308,10 @@ regcomp(regex_t * __restrict preg,
|
||||
/* do it */
|
||||
EMIT(OEND, 0);
|
||||
g->firststate = THERE();
|
||||
if (cflags®_EXTENDED)
|
||||
p_ere(p, OUT);
|
||||
else if (cflags®_NOSPEC)
|
||||
if (cflags & REG_NOSPEC)
|
||||
p_str(p);
|
||||
else
|
||||
p_bre(p, OUT, OUT);
|
||||
p_re(p, OUT, OUT);
|
||||
EMIT(OEND, 0);
|
||||
g->laststate = THERE();
|
||||
|
||||
@ -305,56 +347,12 @@ regcomp(regex_t * __restrict preg,
|
||||
}
|
||||
|
||||
/*
|
||||
- p_ere - ERE parser top level, concatenation and alternation
|
||||
== static void p_ere(struct parse *p, int_t stop);
|
||||
- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
|
||||
- return whether we should terminate or not
|
||||
== static bool p_ere_exp(struct parse *p);
|
||||
*/
|
||||
static void
|
||||
p_ere(struct parse *p,
|
||||
int stop) /* character this ERE should end at */
|
||||
{
|
||||
char c;
|
||||
sopno prevback;
|
||||
sopno prevfwd;
|
||||
sopno conc;
|
||||
int first = 1; /* is this the first alternative? */
|
||||
|
||||
for (;;) {
|
||||
/* do a bunch of concatenated expressions */
|
||||
conc = HERE();
|
||||
while (MORE() && (c = PEEK()) != '|' && c != stop)
|
||||
p_ere_exp(p);
|
||||
(void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
|
||||
|
||||
if (!EAT('|'))
|
||||
break; /* NOTE BREAK OUT */
|
||||
|
||||
if (first) {
|
||||
INSERT(OCH_, conc); /* offset is wrong */
|
||||
prevfwd = conc;
|
||||
prevback = conc;
|
||||
first = 0;
|
||||
}
|
||||
ASTERN(OOR1, prevback);
|
||||
prevback = THERE();
|
||||
AHEAD(prevfwd); /* fix previous offset */
|
||||
prevfwd = HERE();
|
||||
EMIT(OOR2, 0); /* offset is very wrong */
|
||||
}
|
||||
|
||||
if (!first) { /* tail-end fixups */
|
||||
AHEAD(prevfwd);
|
||||
ASTERN(O_CH, prevback);
|
||||
}
|
||||
|
||||
assert(!MORE() || SEE(stop));
|
||||
}
|
||||
|
||||
/*
|
||||
- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
|
||||
== static void p_ere_exp(struct parse *p);
|
||||
*/
|
||||
static void
|
||||
p_ere_exp(struct parse *p)
|
||||
static bool
|
||||
p_ere_exp(struct parse *p, struct branchc *bc)
|
||||
{
|
||||
char c;
|
||||
wint_t wc;
|
||||
@ -377,7 +375,7 @@ p_ere_exp(struct parse *p)
|
||||
p->pbegin[subno] = HERE();
|
||||
EMIT(OLPAREN, subno);
|
||||
if (!SEE(')'))
|
||||
p_ere(p, ')');
|
||||
p_re(p, ')', IGN);
|
||||
if (subno < NPAREN) {
|
||||
p->pend[subno] = HERE();
|
||||
assert(p->pend[subno] != 0);
|
||||
@ -445,7 +443,7 @@ p_ere_exp(struct parse *p)
|
||||
/* FALLTHROUGH */
|
||||
default:
|
||||
if (p->error != 0)
|
||||
return;
|
||||
return (false);
|
||||
p->next--;
|
||||
wc = WGETNEXT();
|
||||
ordinary(p, wc);
|
||||
@ -453,12 +451,12 @@ p_ere_exp(struct parse *p)
|
||||
}
|
||||
|
||||
if (!MORE())
|
||||
return;
|
||||
return (false);
|
||||
c = PEEK();
|
||||
/* we call { a repetition if followed by a digit */
|
||||
if (!( c == '*' || c == '+' || c == '?' ||
|
||||
(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
|
||||
return; /* no repetition, we're done */
|
||||
return (false); /* no repetition, we're done */
|
||||
NEXT();
|
||||
|
||||
(void)REQUIRE(!wascaret, REG_BADRPT);
|
||||
@ -504,12 +502,13 @@ p_ere_exp(struct parse *p)
|
||||
}
|
||||
|
||||
if (!MORE())
|
||||
return;
|
||||
return (false);
|
||||
c = PEEK();
|
||||
if (!( c == '*' || c == '+' || c == '?' ||
|
||||
(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
|
||||
return;
|
||||
return (false);
|
||||
SETERROR(REG_BADRPT);
|
||||
return (false);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -525,9 +524,121 @@ p_str(struct parse *p)
|
||||
}
|
||||
|
||||
/*
|
||||
- p_bre - BRE parser top level, anchoring and concatenation
|
||||
== static void p_bre(struct parse *p, int end1, \
|
||||
== int end2);
|
||||
* Eat consecutive branch delimiters for the kind of expression that we are
|
||||
* parsing, return the number of delimiters that we ate.
|
||||
*/
|
||||
static int
|
||||
p_branch_eat_delim(struct parse *p, struct branchc *bc)
|
||||
{
|
||||
int nskip;
|
||||
|
||||
nskip = 0;
|
||||
while (EAT('|'))
|
||||
++nskip;
|
||||
return (nskip);
|
||||
}
|
||||
|
||||
/*
|
||||
* Insert necessary branch book-keeping operations. This emits a
|
||||
* bogus 'next' offset, since we still have more to parse
|
||||
*/
|
||||
static void
|
||||
p_branch_ins_offset(struct parse *p, struct branchc *bc)
|
||||
{
|
||||
|
||||
if (bc->nbranch == 0) {
|
||||
INSERT(OCH_, bc->start); /* offset is wrong */
|
||||
bc->fwd = bc->start;
|
||||
bc->back = bc->start;
|
||||
}
|
||||
|
||||
ASTERN(OOR1, bc->back);
|
||||
bc->back = THERE();
|
||||
AHEAD(bc->fwd); /* fix previous offset */
|
||||
bc->fwd = HERE();
|
||||
EMIT(OOR2, 0); /* offset is very wrong */
|
||||
++bc->nbranch;
|
||||
}
|
||||
|
||||
/*
|
||||
* Fix the offset of the tail branch, if we actually had any branches.
|
||||
* This is to correct the bogus placeholder offset that we use.
|
||||
*/
|
||||
static void
|
||||
p_branch_fix_tail(struct parse *p, struct branchc *bc)
|
||||
{
|
||||
|
||||
/* Fix bogus offset at the tail if we actually have branches */
|
||||
if (bc->nbranch > 0) {
|
||||
AHEAD(bc->fwd);
|
||||
ASTERN(O_CH, bc->back);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Signal to the parser that an empty branch has been encountered; this will,
|
||||
* in the future, be used to allow for more permissive behavior with empty
|
||||
* branches.
|
||||
*/
|
||||
static void
|
||||
p_branch_empty(struct parse *p, struct branchc *bc)
|
||||
{
|
||||
|
||||
SETERROR(REG_EMPTY);
|
||||
}
|
||||
|
||||
/*
|
||||
* Take care of any branching requirements. This includes inserting the
|
||||
* appropriate branching instructions as well as eating all of the branch
|
||||
* delimiters until we either run out of pattern or need to parse more pattern.
|
||||
*/
|
||||
static bool
|
||||
p_branch_do(struct parse *p, struct branchc *bc)
|
||||
{
|
||||
int ate = 0;
|
||||
|
||||
ate = p_branch_eat_delim(p, bc);
|
||||
if (ate == 0)
|
||||
return (false);
|
||||
(void)REQUIRE(ate == 1 && (!bc->outer || MORE()), REG_EMPTY);
|
||||
p_branch_ins_offset(p, bc);
|
||||
|
||||
return (true);
|
||||
}
|
||||
|
||||
static void
|
||||
p_bre_pre_parse(struct parse *p, struct branchc *bc)
|
||||
{
|
||||
|
||||
(void) bc;
|
||||
/*
|
||||
* Does not move cleanly into expression parser because of
|
||||
* ordinary interpration of * at the beginning position of
|
||||
* an expression.
|
||||
*/
|
||||
if (EAT('^')) {
|
||||
EMIT(OBOL, 0);
|
||||
p->g->iflags |= USEBOL;
|
||||
p->g->nbol++;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
p_bre_post_parse(struct parse *p, struct branchc *bc)
|
||||
{
|
||||
|
||||
/* Expression is terminating due to EOL token */
|
||||
if (bc->terminate) {
|
||||
DROP(1);
|
||||
EMIT(OEOL, 0);
|
||||
p->g->iflags |= USEEOL;
|
||||
p->g->neol++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
- p_re - Top level parser, concatenation and BRE anchoring
|
||||
== static void p_re(struct parse *p, int end1, int end2);
|
||||
* Giving end1 as OUT essentially eliminates the end1/end2 check.
|
||||
*
|
||||
* This implementation is a bit of a kludge, in that a trailing $ is first
|
||||
@ -535,40 +646,55 @@ p_str(struct parse *p)
|
||||
* The amount of lookahead needed to avoid this kludge is excessive.
|
||||
*/
|
||||
static void
|
||||
p_bre(struct parse *p,
|
||||
int end1, /* first terminating character */
|
||||
int end2) /* second terminating character */
|
||||
p_re(struct parse *p,
|
||||
int end1, /* first terminating character */
|
||||
int end2) /* second terminating character; ignored for EREs */
|
||||
{
|
||||
sopno start = HERE();
|
||||
int first = 1; /* first subexpression? */
|
||||
int wasdollar = 0;
|
||||
struct branchc bc;
|
||||
|
||||
if (EAT('^')) {
|
||||
EMIT(OBOL, 0);
|
||||
p->g->iflags |= USEBOL;
|
||||
p->g->nbol++;
|
||||
bc.nbranch = 0;
|
||||
if (end1 == OUT && end2 == OUT)
|
||||
bc.outer = true;
|
||||
else
|
||||
bc.outer = false;
|
||||
#define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
|
||||
for (;;) {
|
||||
bc.start = HERE();
|
||||
bc.nchain = 0;
|
||||
bc.terminate = false;
|
||||
if (p->pre_parse != NULL)
|
||||
p->pre_parse(p, &bc);
|
||||
while (MORE() && !SEESPEC('|') && !SEEEND()) {
|
||||
bc.terminate = p->parse_expr(p, &bc);
|
||||
++bc.nchain;
|
||||
}
|
||||
if (p->post_parse != NULL)
|
||||
p->post_parse(p, &bc);
|
||||
(void) REQUIRE(HERE() != bc.start, REG_EMPTY);
|
||||
if (!p->allowbranch)
|
||||
break;
|
||||
/*
|
||||
* p_branch_do's return value indicates whether we should
|
||||
* continue parsing or not. This is both for correctness and
|
||||
* a slight optimization, because it will check if we've
|
||||
* encountered an empty branch or the end of the string
|
||||
* immediately following a branch delimiter.
|
||||
*/
|
||||
if (!p_branch_do(p, &bc))
|
||||
break;
|
||||
}
|
||||
while (MORE() && !SEETWO(end1, end2)) {
|
||||
wasdollar = p_simp_re(p, first);
|
||||
first = 0;
|
||||
}
|
||||
if (wasdollar) { /* oops, that was a trailing anchor */
|
||||
DROP(1);
|
||||
EMIT(OEOL, 0);
|
||||
p->g->iflags |= USEEOL;
|
||||
p->g->neol++;
|
||||
}
|
||||
|
||||
(void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
|
||||
#undef SEE_END
|
||||
if (p->allowbranch)
|
||||
p_branch_fix_tail(p, &bc);
|
||||
assert(!MORE() || SEE(end1));
|
||||
}
|
||||
|
||||
/*
|
||||
- p_simp_re - parse a simple RE, an atom possibly followed by a repetition
|
||||
== static int p_simp_re(struct parse *p, int starordinary);
|
||||
== static bool p_simp_re(struct parse *p, struct branchc *bc);
|
||||
*/
|
||||
static int /* was the simple RE an unbackslashed $? */
|
||||
p_simp_re(struct parse *p,
|
||||
int starordinary) /* is a leading * an ordinary character? */
|
||||
static bool /* was the simple RE an unbackslashed $? */
|
||||
p_simp_re(struct parse *p, struct branchc *bc)
|
||||
{
|
||||
int c;
|
||||
int count;
|
||||
@ -614,7 +740,7 @@ p_simp_re(struct parse *p,
|
||||
EMIT(OLPAREN, subno);
|
||||
/* the MORE here is an error heuristic */
|
||||
if (MORE() && !SEETWO('\\', ')'))
|
||||
p_bre(p, '\\', ')');
|
||||
p_re(p, '\\', ')');
|
||||
if (subno < NPAREN) {
|
||||
p->pend[subno] = HERE();
|
||||
assert(p->pend[subno] != 0);
|
||||
@ -650,11 +776,16 @@ p_simp_re(struct parse *p,
|
||||
p->g->backrefs = 1;
|
||||
break;
|
||||
case '*':
|
||||
(void)REQUIRE(starordinary, REG_BADRPT);
|
||||
/*
|
||||
* Ordinary if used as the first character beyond BOL anchor of
|
||||
* a (sub-)expression, counts as a bad repetition operator if it
|
||||
* appears otherwise.
|
||||
*/
|
||||
(void)REQUIRE(bc->nchain == 0, REG_BADRPT);
|
||||
/* FALLTHROUGH */
|
||||
default:
|
||||
if (p->error != 0)
|
||||
return(0); /* Definitely not $... */
|
||||
return (false); /* Definitely not $... */
|
||||
p->next--;
|
||||
wc = WGETNEXT();
|
||||
ordinary(p, wc);
|
||||
@ -685,9 +816,9 @@ p_simp_re(struct parse *p,
|
||||
SETERROR(REG_BADBR);
|
||||
}
|
||||
} else if (c == '$') /* $ (but not \$) ends it */
|
||||
return(1);
|
||||
return (true);
|
||||
|
||||
return(0);
|
||||
return (false);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -189,4 +189,5 @@ struct re_guts {
|
||||
|
||||
/* misc utilities */
|
||||
#define OUT (CHAR_MIN - 1) /* a non-character value */
|
||||
#define IGN (CHAR_MIN - 2)
|
||||
#define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_')
|
||||
|
Loading…
x
Reference in New Issue
Block a user