Enhance the optimization provided by pre-matching. Fix style bugs with

previous commits. At the time we search the pattern for the "must" string, we now compute the longest offset from the beginning of the pattern at which the must string might be found. If that offset is found to be infinite (through use of "+" or "*"), we set it to -1 to disable the heuristics applied later. After we are done with pre-matching, we use that offset and the point in the text at which the must string was found to compute the earliest point at which the pattern might be found. Special care should be taken here. The variable "start" is passed to the automata-processing functions fast() and slow() to indicate the point in the text at which they should start working. The real beginning of the text is passed in a struct match variable m, which is used to check for anchors. That variable, though, is initialized with "start", so we must not adjust "start" before "m" is properly initialized. Simple tests showed a speed increase from 100% to 400%, but they were biased in that regexec() was called for the whole file instead of line by line, and parenthesized subexpressions were not searched for. This change adds a single integer to the size of the "guts" structure, and does not change the ABI. Further improvements possible: Since the speed increase observed here is so huge, one intuitive optimization would be to introduce a bias in the function that computes the "must" string so as to prefer a smaller string with a finite offset over a larger one with an infinite offset. Tests have shown this to be a bad idea, though, as the cost of false pre-matches far outweighs the benefits of a must offset, even in biased situations. A number of other improvements suggest themselves, though: * identify the cases where the pattern is identical to the must string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of the pattern, and use that to set the point at which fast() and slow() should give up trying to find a match, and then return to pre-matching. * return all the way to pre-matching if a "match" was found and later invalidated by back reference processing. Since back references are evil and should be avoided anyway, this is of little use.
2000-07-02 10:58:07 +00:00 · 2000-07-02 10:58:07 +00:00 · 6976417416
commit 6976417416
parent fa6b79f1ea
3 changed files with 177 additions and 12 deletions
--- a/lib/libc/regex/engine.c
+++ b/lib/libc/regex/engine.c
@ -180,7 +180,7 @@ int eflags;

 	/* prescreening; this does wonders for this rather slow code */
 	if (g->must != NULL) {
-		if(g->charjump != NULL && g->matchjump != NULL) {
+		if (g->charjump != NULL && g->matchjump != NULL) {
 			mustlen = -g->mlen;
 			mustfirst = g->must;
 			mustlast = g->must + g->mlen - 1;
@ -188,21 +188,21 @@ int eflags;
 			matchjump = g->matchjump;
 			bmps = stop;
 			pp = mustlast;
-			for(bmp = start+g->mlen-1; bmp < bmps;) {
+			for (bmp = start+g->mlen-1; bmp < bmps;) {
 				/* Fast skip non-matches */
-				while(bmp < bmps && charjump[*bmp])
+				while (bmp < bmps && charjump[*bmp])
 					bmp += charjump[*bmp];

-				if(bmp >= bmps)
+				if (bmp >= bmps)
 					break;

 				/* Greedy matcher */
 				/* We depend on not being used for
 				 * for strings of length 1
 				 */
-				while(*--bmp == *--pp && pp != mustfirst);
+				while (*--bmp == *--pp && pp != mustfirst);

-				if(*bmp == *pp)
+				if (*bmp == *pp)
 					break;

 				/* Jump to next possible match */
@ -211,8 +211,9 @@ int eflags;
 				bmp += (cj < mj ? mj : cj);
 				pp = mustlast;				
 			}
-			if(pp != mustfirst)
+			if (pp != mustfirst)
 				return(REG_NOMATCH);
+			dp = bmp;
 		} else {
 			for (dp = start; dp < stop; dp++)
 				if (*dp == g->must[0] &&
@ -239,6 +240,10 @@ int eflags;
 	SETUP(m->empty);
 	CLEAR(m->empty);

+	/* Adjust start according to moffset, to speed things up */
+	if (g->moffset > -1)
+		start = dp - g->moffset;
+
 	/* this loop does only one repetition except for backrefs */
 	for (;;) {
 		endp = fast(m, start, stop, gf, gl);
--- a/lib/libc/regex/regcomp.c
+++ b/lib/libc/regex/regcomp.c
@ -124,6 +124,7 @@ static void dofwd __P((struct parse *p, sopno pos, sop value));
 static void enlarge __P((struct parse *p, sopno size));
 static void stripsnug __P((struct parse *p, struct re_guts *g));
 static void findmust __P((struct parse *p, struct re_guts *g));
+static int altoffset __P((sop *scan, int offset, int mccs));
 static void computejumps __P((struct parse *p, struct re_guts *g));
 static void computematchjumps __P((struct parse *p, struct re_guts *g));
 static sopno pluscount __P((struct parse *p, struct re_guts *g));
@ -246,6 +247,7 @@ int cflags;
 	g->nbol = 0;
 	g->neol = 0;
 	g->must = NULL;
+	g->moffset = -1;
 	g->charjump = NULL;
 	g->matchjump = NULL;
 	g->mlen = 0;
@ -1695,13 +1697,23 @@ register struct re_guts *g;
 	register sop s;
 	register char *cp;
 	register sopno i;
+	int offset;
+	int cs, mccs;

 	/* avoid making error situations worse */
 	if (p->error != 0)
 		return;

+	/* Find out if we can handle OANYOF or not */
+	mccs = 0;
+	for (cs = 0; cs < g->ncsets; cs++)
+		if (g->sets[cs].multis != NULL)
+			mccs = 1;
+
 	/* find the longest OCHAR sequence in strip */
 	newlen = 0;
+	offset = 0;
+	g->moffset = 0;
 	scan = g->strip + 1;
 	do {
 		s = *scan++;
@ -1717,6 +1729,7 @@ register struct re_guts *g;
 			break;
 		case OQUEST_:		/* things that must be skipped */
 		case OCH_:
+			offset = altoffset(scan, offset, mccs);
 			scan--;
 			do {
 				scan += OPND(s);
@ -1729,23 +1742,97 @@ register struct re_guts *g;
 				}
 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
 			/* fallthrough */
-		default:		/* things that break a sequence */
+		case OBOW:		/* things that break a sequence */
+		case OEOW:
+		case OBOL:
+		case OEOL:
+		case O_QUEST:
+		case O_CH:
+		case OEND:
 			if (newlen > g->mlen) {		/* ends one */
 				start = newstart;
 				g->mlen = newlen;
+				if (offset > -1) {
+					g->moffset += offset;
+					offset = newlen;
+				} else
+					g->moffset = offset;
+			} else {
+				if (offset > -1)
+					offset += newlen;
 			}
 			newlen = 0;
 			break;
+		case OANY:
+			if (newlen > g->mlen) {		/* ends one */
+				start = newstart;
+				g->mlen = newlen;
+				if (offset > -1) {
+					g->moffset += offset;
+					offset = newlen;
+				} else
+					g->moffset = offset;
+			} else {
+				if (offset > -1)
+					offset += newlen;
+			}
+			if (offset > -1)
+				offset++;
+			newlen = 0;
+			break;
+		case OANYOF:		/* may or may not invalidate offset */
+			/* First, everything as OANY */
+			if (newlen > g->mlen) {		/* ends one */
+				start = newstart;
+				g->mlen = newlen;
+				if (offset > -1) {
+					g->moffset += offset;
+					offset = newlen;
+				} else
+					g->moffset = offset;
+			} else {
+				if (offset > -1)
+					offset += newlen;
+			}
+			if (offset > -1)
+				offset++;
+			newlen = 0;
+			/* And, now, if we found out we can't deal with
+			 * it, make offset = -1.
+			 */
+			if (mccs)
+				offset = -1;
+			break;
+		default:
+			/* Anything here makes it impossible or too hard
+			 * to calculate the offset -- so we give up;
+			 * save the last known good offset, in case the
+			 * must sequence doesn't occur later.
+			 */
+			if (newlen > g->mlen) {		/* ends one */
+				start = newstart;
+				g->mlen = newlen;
+				if (offset > -1)
+					g->moffset += offset;
+				else
+					g->moffset = offset;
+			}
+			offset = -1;
+			newlen = 0;
+			break;
 		}
 	} while (OP(s) != OEND);

-	if (g->mlen == 0)		/* there isn't one */
+	if (g->mlen == 0) {		/* there isn't one */
+		g->moffset = -1;
 		return;
+	}

 	/* turn it into a character string */
 	g->must = malloc((size_t)g->mlen + 1);
 	if (g->must == NULL) {		/* argh; just forget it */
 		g->mlen = 0;
+		g->moffset = -1;
 		return;
 	}
 	cp = g->must;
@ -1760,6 +1847,78 @@ register struct re_guts *g;
 	*cp++ = '\0';		/* just on general principles */
 }

+/*
+ - altoffset - choose biggest offset among multiple choices
+ = static int altoffset(sop *scan, int offset, int mccs);
+ *
+ * Compute, recursively if necessary, the largest offset among multiple
+ * re paths.
+ *
+ * scan   - points at the first operator after the OQUEST_/OCH_ that
+ *          opens the group being measured.
+ * offset - offset accumulated before the group; -1 means the caller has
+ *          already given up, and -1 is returned unchanged.
+ * mccs   - non-zero if OANYOF cannot be counted as one position (any
+ *          OANYOF then makes the result -1).
+ *
+ * Returns offset plus the length of the longest branch of the group, or
+ * -1 if that length cannot be determined.
+ */
+static int
+altoffset(scan, offset, mccs)
+sop *scan;
+int offset;
+int mccs;
+{
+	int largest;
+	int try;
+	sop s;
+
+	/* If we gave up already on offsets, return */
+	if (offset == -1)
+		return -1;
+
+	largest = 0;
+	try = 0;
+	s = *scan++;
+	while (OP(s) != O_QUEST && OP(s) != O_CH) {
+		switch (OP(s)) {
+		case OOR1:
+			/* End of one branch: remember it if it is the longest */
+			if (try > largest)
+				largest = try;
+			try = 0;
+			break;
+		case OQUEST_:
+		case OCH_:
+			/* Nested group: add its longest branch to this one */
+			try = altoffset(scan, try, mccs);
+			if (try == -1)
+				return -1;
+			/* Hop from the opener to its closing operator */
+			scan--;
+			do {
+				scan += OPND(s);
+				s = *scan;
+				if (OP(s) != O_QUEST && OP(s) != O_CH &&
+							OP(s) != OOR2)
+					return -1;
+			} while (OP(s) != O_QUEST && OP(s) != O_CH);
+			/* We must skip past the nested closing operator, or
+			 * the loop below would re-read it and mistake it for
+			 * the end of the group we are measuring.
+			 */
+			scan++;
+			break;
+		case OANYOF:
+			if (mccs)
+				return -1;
+			/* FALLTHROUGH */
+		case OCHAR:
+		case OANY:
+			/* Each of these accounts for exactly one position */
+			try++;
+			/* FALLTHROUGH */
+		case OBOW:
+		case OEOW:
+		case OLPAREN:
+		case ORPAREN:
+		case OOR2:
+			/* Zero-width as far as the offset is concerned */
+			break;
+		default:
+			/* Anything else makes the length unknown */
+			try = -1;
+			break;
+		}
+		if (try == -1)
+			return -1;
+		s = *scan++;
+	}
+
+	/* Account for the last (or only) branch */
+	if (try > largest)
+		largest = try;
+
+	return largest+offset;
+}
+
 /*
 - computejumps - compute char jumps for BM scan
 == static void computejumps(register struct parse *p, register struct re_guts *g);
@ -1783,7 +1942,7 @@ struct re_guts *g;
 	if (p->error != 0)
 		return;

-	g->charjump = malloc(256 * sizeof(int));
+	g->charjump = malloc((UCHAR_MAX+1) * sizeof(int));
 	if (g->charjump == NULL)	/* Not a fatal error */
 		return;

@ -1874,8 +2033,8 @@ struct re_guts *g;
 		    g->mlen + suffix - mindex);

        ssuffix = pmatches[suffix];
-        while(suffix < g->mlen) {
-                while(suffix <= ssuffix) {
+        while (suffix < g->mlen) {
+                while (suffix <= ssuffix) {
                        g->matchjump[suffix] = MIN(g->matchjump[suffix],
 			    g->mlen + ssuffix - suffix);
                        suffix++;
--- a/lib/libc/regex/regex2.h
+++ b/lib/libc/regex/regex2.h
@ -162,6 +162,7 @@ struct re_guts {
 	int ncategories;	/* how many character categories */
 	cat_t *categories;	/* ->catspace[-CHAR_MIN] */
 	char *must;		/* match must contain this string */
+	int moffset;		/* latest point at which must may be located */
 	int *charjump;		/* Boyer-Moore char jump table */
 	int *matchjump;		/* Boyer-Moore match jump table */
 	int mlen;		/* length of must */