Enhance the optimization provided by pre-matching. Fix style bugs from previous commits.

At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.

After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.

Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.

Simple tests showed speed increases ranging from 100% to 400%, but they
were biased in that regexec() was called for the whole file instead of
line by line, and parenthesized subexpressions were not searched for.

This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.

Further improvements possible:

Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.

A number of other improvements suggest themselves, though:

	* identify the cases where the pattern is identical to the must
	string, and avoid entering fast() and slow() in these cases.

	* compute the maximum offset from the must string to the end of
	the pattern, and use that to set the point at which fast() and
	slow() should give up trying to find a match, and then return
	to pre-matching.

	* return all the way to pre-matching if a "match" was found and
	later invalidated by back reference processing. Since back
	references are evil and should be avoided anyway, this is of
	little use.
This commit is contained in:
dcs 2000-07-02 10:58:07 +00:00
parent fa6b79f1ea
commit 6976417416
3 changed files with 177 additions and 12 deletions

View File

@ -180,7 +180,7 @@ int eflags;
/* prescreening; this does wonders for this rather slow code */
if (g->must != NULL) {
if(g->charjump != NULL && g->matchjump != NULL) {
if (g->charjump != NULL && g->matchjump != NULL) {
mustlen = -g->mlen;
mustfirst = g->must;
mustlast = g->must + g->mlen - 1;
@ -188,21 +188,21 @@ int eflags;
matchjump = g->matchjump;
bmps = stop;
pp = mustlast;
for(bmp = start+g->mlen-1; bmp < bmps;) {
for (bmp = start+g->mlen-1; bmp < bmps;) {
/* Fast skip non-matches */
while(bmp < bmps && charjump[*bmp])
while (bmp < bmps && charjump[*bmp])
bmp += charjump[*bmp];
if(bmp >= bmps)
if (bmp >= bmps)
break;
/* Greedy matcher */
/* We depend on not being used
* for strings of length 1
*/
while(*--bmp == *--pp && pp != mustfirst);
while (*--bmp == *--pp && pp != mustfirst);
if(*bmp == *pp)
if (*bmp == *pp)
break;
/* Jump to next possible match */
@ -211,8 +211,9 @@ int eflags;
bmp += (cj < mj ? mj : cj);
pp = mustlast;
}
if(pp != mustfirst)
if (pp != mustfirst)
return(REG_NOMATCH);
dp = bmp;
} else {
for (dp = start; dp < stop; dp++)
if (*dp == g->must[0] &&
@ -239,6 +240,10 @@ int eflags;
SETUP(m->empty);
CLEAR(m->empty);
/* Adjust start according to moffset, to speed things up */
if (g->moffset > -1)
start = dp - g->moffset;
/* this loop does only one repetition except for backrefs */
for (;;) {
endp = fast(m, start, stop, gf, gl);

View File

@ -124,6 +124,7 @@ static void dofwd __P((struct parse *p, sopno pos, sop value));
static void enlarge __P((struct parse *p, sopno size));
static void stripsnug __P((struct parse *p, struct re_guts *g));
static void findmust __P((struct parse *p, struct re_guts *g));
static int altoffset __P((sop *scan, int offset, int mccs));
static void computejumps __P((struct parse *p, struct re_guts *g));
static void computematchjumps __P((struct parse *p, struct re_guts *g));
static sopno pluscount __P((struct parse *p, struct re_guts *g));
@ -246,6 +247,7 @@ int cflags;
g->nbol = 0;
g->neol = 0;
g->must = NULL;
g->moffset = -1;
g->charjump = NULL;
g->matchjump = NULL;
g->mlen = 0;
@ -1695,13 +1697,23 @@ register struct re_guts *g;
register sop s;
register char *cp;
register sopno i;
int offset;
int cs, mccs;
/* avoid making error situations worse */
if (p->error != 0)
return;
/* Find out if we can handle OANYOF or not */
mccs = 0;
for (cs = 0; cs < g->ncsets; cs++)
if (g->sets[cs].multis != NULL)
mccs = 1;
/* find the longest OCHAR sequence in strip */
newlen = 0;
offset = 0;
g->moffset = 0;
scan = g->strip + 1;
do {
s = *scan++;
@ -1717,6 +1729,7 @@ register struct re_guts *g;
break;
case OQUEST_: /* things that must be skipped */
case OCH_:
offset = altoffset(scan, offset, mccs);
scan--;
do {
scan += OPND(s);
@ -1729,23 +1742,97 @@ register struct re_guts *g;
}
} while (OP(s) != O_QUEST && OP(s) != O_CH);
/* fallthrough */
default: /* things that break a sequence */
case OBOW: /* things that break a sequence */
case OEOW:
case OBOL:
case OEOL:
case O_QUEST:
case O_CH:
case OEND:
if (newlen > g->mlen) { /* ends one */
start = newstart;
g->mlen = newlen;
if (offset > -1) {
g->moffset += offset;
offset = newlen;
} else
g->moffset = offset;
} else {
if (offset > -1)
offset += newlen;
}
newlen = 0;
break;
case OANY:
if (newlen > g->mlen) { /* ends one */
start = newstart;
g->mlen = newlen;
if (offset > -1) {
g->moffset += offset;
offset = newlen;
} else
g->moffset = offset;
} else {
if (offset > -1)
offset += newlen;
}
if (offset > -1)
offset++;
newlen = 0;
break;
case OANYOF: /* may or may not invalidate offset */
/* First, everything as OANY */
if (newlen > g->mlen) { /* ends one */
start = newstart;
g->mlen = newlen;
if (offset > -1) {
g->moffset += offset;
offset = newlen;
} else
g->moffset = offset;
} else {
if (offset > -1)
offset += newlen;
}
if (offset > -1)
offset++;
newlen = 0;
/* And, now, if we found out we can't deal with
* it, make offset = -1.
*/
if (mccs)
offset = -1;
break;
default:
/* Anything here makes it impossible or too hard
* to calculate the offset -- so we give up;
* save the last known good offset, in case the
* must sequence doesn't occur later.
*/
if (newlen > g->mlen) { /* ends one */
start = newstart;
g->mlen = newlen;
if (offset > -1)
g->moffset += offset;
else
g->moffset = offset;
}
offset = -1;
newlen = 0;
break;
}
} while (OP(s) != OEND);
if (g->mlen == 0) /* there isn't one */
if (g->mlen == 0) { /* there isn't one */
g->moffset = -1;
return;
}
/* turn it into a character string */
g->must = malloc((size_t)g->mlen + 1);
if (g->must == NULL) { /* argh; just forget it */
g->mlen = 0;
g->moffset = -1;
return;
}
cp = g->must;
@ -1760,6 +1847,78 @@ register struct re_guts *g;
*cp++ = '\0'; /* just on general principles */
}
/*
 - altoffset - choose biggest offset among multiple choices
 = static int altoffset(sop *scan, int offset, int mccs);
 *
 * Compute, recursively if necessary, the largest offset among multiple
 * re paths.
 *
 * scan points just past the OQUEST_ or OCH_ operator that opens the
 * construct; operators are consumed up to the matching O_QUEST or O_CH.
 * offset is the offset accumulated by the caller so far, with -1 meaning
 * the caller has already given up on offsets.  A nonzero mccs makes any
 * OANYOF abort the computation.
 *
 * Returns offset plus the length of the longest branch, or -1 if the
 * offset cannot be computed.
 */
static int
altoffset(scan, offset, mccs)
sop *scan;
int offset;
int mccs;
{
	int largest;
	int try;
	sop s;

	/* If we gave up already on offsets, return */
	if (offset == -1)
		return -1;

	largest = 0;
	try = 0;
	s = *scan++;
	while (OP(s) != O_QUEST && OP(s) != O_CH) {
		switch (OP(s)) {
		case OOR1:
			/* end of one branch: keep the longest seen so far */
			if (try > largest)
				largest = try;
			try = 0;
			break;
		case OQUEST_:
		case OCH_:
			/* nested construct: add its largest branch to ours */
			try = altoffset(scan, try, mccs);
			if (try == -1)
				return -1;
			/* step back onto the opener and follow its operand
			 * links until the nested closing operator is reached
			 */
			scan--;
			do {
				scan += OPND(s);
				s = *scan;
				if (OP(s) != O_QUEST && OP(s) != O_CH &&
				    OP(s) != OOR2)
					return -1;
			} while (OP(s) != O_QUEST && OP(s) != O_CH);
			/* We must skip past the nested closing operator;
			 * otherwise the next fetch would read it again and
			 * the enclosing loop would mistake it for its own
			 * terminator, leaving altoffset() too early.
			 */
			scan++;
			break;
		case OANYOF:
			if (mccs)
				return -1;
			/* fallthrough */
		case OCHAR:
		case OANY:
			/* each of these accounts for one position */
			try++;
			/* fallthrough */
		case OBOW:		/* these do not change the offset */
		case OEOW:
		case OLPAREN:
		case ORPAREN:
		case OOR2:
			break;
		default:
			/* anything else makes the offset incomputable */
			try = -1;
			break;
		}
		if (try == -1)
			return -1;
		s = *scan++;
	}

	/* account for the last (or only) branch */
	if (try > largest)
		largest = try;

	return largest+offset;
}
/*
- computejumps - compute char jumps for BM scan
== static void computejumps(register struct parse *p, register struct re_guts *g);
@ -1783,7 +1942,7 @@ struct re_guts *g;
if (p->error != 0)
return;
g->charjump = malloc(256 * sizeof(int));
g->charjump = malloc((UCHAR_MAX+1) * sizeof(int));
if (g->charjump == NULL) /* Not a fatal error */
return;
@ -1874,8 +2033,8 @@ struct re_guts *g;
g->mlen + suffix - mindex);
ssuffix = pmatches[suffix];
while(suffix < g->mlen) {
while(suffix <= ssuffix) {
while (suffix < g->mlen) {
while (suffix <= ssuffix) {
g->matchjump[suffix] = MIN(g->matchjump[suffix],
g->mlen + ssuffix - suffix);
suffix++;

View File

@ -162,6 +162,7 @@ struct re_guts {
int ncategories; /* how many character categories */
cat_t *categories; /* ->catspace[-CHAR_MIN] */
char *must; /* match must contain this string */
int moffset; /* latest point at which must may be located */
int *charjump; /* Boyer-Moore char jump table */
int *matchjump; /* Boyer-Moore match jump table */
int mlen; /* length of must */