The impending libregex will implement GNU extensions to bring BREs and

EREs closer together. Prepare for this and reduce the diff of libregex changes by refactoring and combining the top-level parsers for EREs/BREs ahead of time. Branching functionality has been split out to make it easier to follow the combined version of the top-level parser. It may also be enabled in the parsing context to make it easier when libregex enables branching for BREs. A branching context was also added for the various branching functions and so that BREs, for instance, can determine if they're the first expression in a chain of expressions within the current branch and treat '*' as ordinary if so. This should have no functional impact and negligible performance impact. Reviewed by: cem, emaste, pfg Approved by: emaste (mentor) MFC after: 1 month Differential Revision: https://reviews.freebsd.org/D10920
2017-07-06 17:01:51 +00:00 · 2017-07-06 17:01:51 +00:00 · 15ae9efabe
commit 15ae9efabe
parent 81e12dd187
2 changed files with 229 additions and 97 deletions
--- a/lib/libc/regex/regcomp.c
+++ b/lib/libc/regex/regcomp.c
@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
 #include <limits.h>
 #include <stdlib.h>
 #include <regex.h>
+#include <stdbool.h>
 #include <wchar.h>
 #include <wctype.h>

@ -61,6 +62,24 @@ __FBSDID("$FreeBSD$");

 #include "cname.h"

+/*
+ * Branching context, used to keep track of branch state for all of the branch-
+ * aware functions. In addition to keeping track of branch positions for the
+ * p_branch_* functions, we use this to simplify some clumsiness in BREs for
+ * detection of whether ^ is acting as an anchor or being used erroneously and
+ * also for whether we're in a sub-expression or not.
+ */
+struct branchc {
+	sopno start;
+	sopno back;
+	sopno fwd;
+
+	int nbranch;
+	int nchain;
+	bool outer;
+	bool terminate;
+};
+
 /*
 * parse structure, passed up and down to avoid global variables and
 * other clumsinesses
@ -77,6 +96,11 @@ struct parse {
 #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
+	bool allowbranch;	/* can this expression branch? */
+	bool bre;		/* convenience; is this a BRE? */
+	bool (*parse_expr)(struct parse *, struct branchc *);
+	void (*pre_parse)(struct parse *, struct branchc *);
+	void (*post_parse)(struct parse *, struct branchc *);
 };

 /* ========= begin header generated by ./mkh ========= */
@ -85,11 +109,17 @@ extern "C" {
 #endif

 /* === regcomp.c === */
-static void p_ere(struct parse *p, int stop);
-static void p_ere_exp(struct parse *p);
+static bool p_ere_exp(struct parse *p, struct branchc *bc);
 static void p_str(struct parse *p);
-static void p_bre(struct parse *p, int end1, int end2);
-static int p_simp_re(struct parse *p, int starordinary);
+static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
+static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
+static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
+static void p_branch_empty(struct parse *p, struct branchc *bc);
+static bool p_branch_do(struct parse *p, struct branchc *bc);
+static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
+static void p_bre_post_parse(struct parse *p, struct branchc *bc);
+static void p_re(struct parse *p, int end1, int end2);
+static bool p_simp_re(struct parse *p, struct branchc *bc);
 static int p_count(struct parse *p);
 static void p_bracket(struct parse *p);
 static void p_b_term(struct parse *p, cset *cs);
@ -139,6 +169,7 @@ static char nuls[10];		/* place to point scanner in event of error */
 #define	MORE2()	(p->next+1 < p->end)
 #define	SEE(c)	(MORE() && PEEK() == (c))
 #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
+#define	SEESPEC(a)	(p->bre ? SEETWO('\\', a) : SEE(a))
 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
 #define	NEXT()	(p->next++)
@ -247,6 +278,19 @@ regcomp(regex_t * __restrict preg,
 		p->pbegin[i] = 0;
 		p->pend[i] = 0;
 	}
+	if (cflags & REG_EXTENDED) {
+		p->allowbranch = true;
+		p->bre = false;
+		p->parse_expr = p_ere_exp;
+		p->pre_parse = NULL;
+		p->post_parse = NULL;
+	} else {
+		p->allowbranch = false;
+		p->bre = true;
+		p->parse_expr = p_simp_re;
+		p->pre_parse = p_bre_pre_parse;
+		p->post_parse = p_bre_post_parse;
+	}
 	g->sets = NULL;
 	g->ncsets = 0;
 	g->cflags = cflags;
@ -264,12 +308,10 @@ regcomp(regex_t * __restrict preg,
 	/* do it */
 	EMIT(OEND, 0);
 	g->firststate = THERE();
-	if (cflags&REG_EXTENDED)
-		p_ere(p, OUT);
-	else if (cflags&REG_NOSPEC)
+	if (cflags & REG_NOSPEC)
 		p_str(p);
 	else
-		p_bre(p, OUT, OUT);
+		p_re(p, OUT, OUT);
 	EMIT(OEND, 0);
 	g->laststate = THERE();

@ -305,56 +347,12 @@ regcomp(regex_t * __restrict preg,
 }

 /*
- - p_ere - ERE parser top level, concatenation and alternation
- == static void p_ere(struct parse *p, int_t stop);
+ - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
+ - return whether we should terminate or not
+ == static bool p_ere_exp(struct parse *p);
 */
-static void
-p_ere(struct parse *p,
-	int stop)		/* character this ERE should end at */
-{
-	char c;
-	sopno prevback;
-	sopno prevfwd;
-	sopno conc;
-	int first = 1;		/* is this the first alternative? */
-
-	for (;;) {
-		/* do a bunch of concatenated expressions */
-		conc = HERE();
-		while (MORE() && (c = PEEK()) != '|' && c != stop)
-			p_ere_exp(p);
-		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
-
-		if (!EAT('|'))
-			break;		/* NOTE BREAK OUT */
-
-		if (first) {
-			INSERT(OCH_, conc);	/* offset is wrong */
-			prevfwd = conc;
-			prevback = conc;
-			first = 0;
-		}
-		ASTERN(OOR1, prevback);
-		prevback = THERE();
-		AHEAD(prevfwd);			/* fix previous offset */
-		prevfwd = HERE();
-		EMIT(OOR2, 0);			/* offset is very wrong */
-	}
-
-	if (!first) {		/* tail-end fixups */
-		AHEAD(prevfwd);
-		ASTERN(O_CH, prevback);
-	}
-
-	assert(!MORE() || SEE(stop));
-}
-
-/*
- - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
- == static void p_ere_exp(struct parse *p);
- */
-static void
-p_ere_exp(struct parse *p)
+static bool
+p_ere_exp(struct parse *p, struct branchc *bc)
 {
 	char c;
 	wint_t wc;
@ -377,7 +375,7 @@ p_ere_exp(struct parse *p)
 			p->pbegin[subno] = HERE();
 		EMIT(OLPAREN, subno);
 		if (!SEE(')'))
-			p_ere(p, ')');
+			p_re(p, ')', IGN);
 		if (subno < NPAREN) {
 			p->pend[subno] = HERE();
 			assert(p->pend[subno] != 0);
@ -445,7 +443,7 @@ p_ere_exp(struct parse *p)
 		/* FALLTHROUGH */
 	default:
 		if (p->error != 0)
-			return;
+			return (false);
 		p->next--;
 		wc = WGETNEXT();
 		ordinary(p, wc);
@ -453,12 +451,12 @@ p_ere_exp(struct parse *p)
 	}

 	if (!MORE())
-		return;
+		return (false);
 	c = PEEK();
 	/* we call { a repetition if followed by a digit */
 	if (!( c == '*' || c == '+' || c == '?' ||
 				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
-		return;		/* no repetition, we're done */
+		return (false);		/* no repetition, we're done */
 	NEXT();

 	(void)REQUIRE(!wascaret, REG_BADRPT);
@ -504,12 +502,13 @@ p_ere_exp(struct parse *p)
 	}

 	if (!MORE())
-		return;
+		return (false);
 	c = PEEK();
 	if (!( c == '*' || c == '+' || c == '?' ||
 				(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
-		return;
+		return (false);
 	SETERROR(REG_BADRPT);
+	return (false);
 }

 /*
@ -525,9 +524,121 @@ p_str(struct parse *p)
 }

 /*
- - p_bre - BRE parser top level, anchoring and concatenation
- == static void p_bre(struct parse *p,  int end1, \
- ==	int end2);
+ * Eat consecutive branch delimiters for the kind of expression that we are
+ * parsing, return the number of delimiters that we ate.
+ */
+static int
+p_branch_eat_delim(struct parse *p, struct branchc *bc)
+{
+	int nskip;
+
+	nskip = 0;
+	while (EAT('|'))
+		++nskip;
+	return (nskip);
+}
+
+/*
+ * Insert necessary branch book-keeping operations. This emits a
+ * bogus 'next' offset, since we still have more to parse
+ */
+static void
+p_branch_ins_offset(struct parse *p, struct branchc *bc)
+{
+
+	if (bc->nbranch == 0) {
+		INSERT(OCH_, bc->start);	/* offset is wrong */
+		bc->fwd = bc->start;
+		bc->back = bc->start;
+	}
+
+	ASTERN(OOR1, bc->back);
+	bc->back = THERE();
+	AHEAD(bc->fwd);			/* fix previous offset */
+	bc->fwd = HERE();
+	EMIT(OOR2, 0);			/* offset is very wrong */
+	++bc->nbranch;
+}
+
+/*
+ * Fix the offset of the tail branch, if we actually had any branches.
+ * This is to correct the bogus placeholder offset that we use.
+ */
+static void
+p_branch_fix_tail(struct parse *p, struct branchc *bc)
+{
+
+	/* Fix bogus offset at the tail if we actually have branches */
+	if (bc->nbranch > 0) {
+		AHEAD(bc->fwd);
+		ASTERN(O_CH, bc->back);
+	}
+}
+
+/*
+ * Signal to the parser that an empty branch has been encountered; this will,
+ * in the future, be used to allow for more permissive behavior with empty
+ * branches.
+ */
+static void
+p_branch_empty(struct parse *p, struct branchc *bc)
+{
+
+	SETERROR(REG_EMPTY);
+}
+
+/*
+ * Take care of any branching requirements. This includes inserting the
+ * appropriate branching instructions as well as eating all of the branch
+ * delimiters until we either run out of pattern or need to parse more pattern.
+ */
+static bool
+p_branch_do(struct parse *p, struct branchc *bc)
+{
+	int ate = 0;
+
+	ate = p_branch_eat_delim(p, bc);
+	if (ate == 0)
+		return (false);
+	(void)REQUIRE(ate == 1 && (!bc->outer || MORE()), REG_EMPTY);
+	p_branch_ins_offset(p, bc);
+
+	return (true);
+}
+
+static void
+p_bre_pre_parse(struct parse *p, struct branchc *bc)
+{
+
+	(void) bc;
+	/*
+	 * Does not move cleanly into expression parser because of
+	 * ordinary interpration of * at the beginning position of
+	 * an expression.
+	 */
+	if (EAT('^')) {
+		EMIT(OBOL, 0);
+		p->g->iflags |= USEBOL;
+		p->g->nbol++;
+	}
+}
+
+static void
+p_bre_post_parse(struct parse *p, struct branchc *bc)
+{
+
+	/* Expression is terminating due to EOL token */
+	if (bc->terminate) {
+		DROP(1);
+		EMIT(OEOL, 0);
+		p->g->iflags |= USEEOL;
+		p->g->neol++;
+	}
+}
+
+/*
+ - p_re - Top level parser, concatenation and BRE anchoring
+ == static void p_re(struct parse *p, int end1, int end2);
 * Giving end1 as OUT essentially eliminates the end1/end2 check.
 *
 * This implementation is a bit of a kludge, in that a trailing $ is first
@ -535,40 +646,55 @@ p_str(struct parse *p)
 * The amount of lookahead needed to avoid this kludge is excessive.
 */
 static void
-p_bre(struct parse *p,
-	int end1,		/* first terminating character */
-	int end2)		/* second terminating character */
+p_re(struct parse *p,
+	int end1,	/* first terminating character */
+	int end2)	/* second terminating character; ignored for EREs */
 {
-	sopno start = HERE();
-	int first = 1;			/* first subexpression? */
-	int wasdollar = 0;
+	struct branchc bc;

-	if (EAT('^')) {
-		EMIT(OBOL, 0);
-		p->g->iflags |= USEBOL;
-		p->g->nbol++;
+	bc.nbranch = 0;
+	if (end1 == OUT && end2 == OUT)
+		bc.outer = true;
+	else
+		bc.outer = false;
+#define	SEEEND()	(!p->bre ? SEE(end1) : SEETWO(end1, end2))
+	for (;;) {
+		bc.start = HERE();
+		bc.nchain = 0;
+		bc.terminate = false;
+		if (p->pre_parse != NULL)
+			p->pre_parse(p, &bc);
+		while (MORE() && !SEESPEC('|') && !SEEEND()) {
+			bc.terminate = p->parse_expr(p, &bc);
+			++bc.nchain;
+		}
+		if (p->post_parse != NULL)
+			p->post_parse(p, &bc);
+		(void) REQUIRE(HERE() != bc.start, REG_EMPTY);
+		if (!p->allowbranch)
+			break;
+		/*
+		 * p_branch_do's return value indicates whether we should
+		 * continue parsing or not. This is both for correctness and
+		 * a slight optimization, because it will check if we've
+		 * encountered an empty branch or the end of the string
+		 * immediately following a branch delimiter.
+		 */
+		if (!p_branch_do(p, &bc))
+			break;
 	}
-	while (MORE() && !SEETWO(end1, end2)) {
-		wasdollar = p_simp_re(p, first);
-		first = 0;
-	}
-	if (wasdollar) {	/* oops, that was a trailing anchor */
-		DROP(1);
-		EMIT(OEOL, 0);
-		p->g->iflags |= USEEOL;
-		p->g->neol++;
-	}
-
-	(void)REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
+#undef SEE_END
+	if (p->allowbranch)
+		p_branch_fix_tail(p, &bc);
+	assert(!MORE() || SEE(end1));
 }

 /*
 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
- == static int p_simp_re(struct parse *p, int starordinary);
+ == static bool p_simp_re(struct parse *p, struct branchc *bc);
 */
-static int			/* was the simple RE an unbackslashed $? */
-p_simp_re(struct parse *p,
-	int starordinary)	/* is a leading * an ordinary character? */
+static bool			/* was the simple RE an unbackslashed $? */
+p_simp_re(struct parse *p, struct branchc *bc)
 {
 	int c;
 	int count;
@ -614,7 +740,7 @@ p_simp_re(struct parse *p,
 		EMIT(OLPAREN, subno);
 		/* the MORE here is an error heuristic */
 		if (MORE() && !SEETWO('\\', ')'))
-			p_bre(p, '\\', ')');
+			p_re(p, '\\', ')');
 		if (subno < NPAREN) {
 			p->pend[subno] = HERE();
 			assert(p->pend[subno] != 0);
@ -650,11 +776,16 @@ p_simp_re(struct parse *p,
 		p->g->backrefs = 1;
 		break;
 	case '*':
-		(void)REQUIRE(starordinary, REG_BADRPT);
+		/*
+		 * Ordinary if used as the first character beyond BOL anchor of
+		 * a (sub-)expression, counts as a bad repetition operator if it
+		 * appears otherwise.
+		 */
+		(void)REQUIRE(bc->nchain == 0, REG_BADRPT);
 		/* FALLTHROUGH */
 	default:
 		if (p->error != 0)
-			return(0);	/* Definitely not $... */
+			return (false);	/* Definitely not $... */
 		p->next--;
 		wc = WGETNEXT();
 		ordinary(p, wc);
@ -685,9 +816,9 @@ p_simp_re(struct parse *p,
 			SETERROR(REG_BADBR);
 		}
 	} else if (c == '$')     /* $ (but not \$) ends it */
-		return(1);
+		return (true);

-	return(0);
+	return (false);
 }

 /*
--- a/lib/libc/regex/regex2.h
+++ b/lib/libc/regex/regex2.h
@ -189,4 +189,5 @@ struct re_guts {

 /* misc utilities */
 #define	OUT	(CHAR_MIN - 1)	/* a non-character value */
+#define	IGN	(CHAR_MIN - 2)
 #define ISWORD(c)       (iswalnum((uch)(c)) || (c) == '_')