1994-05-27 05:00:24 +00:00
|
|
|
/*-
|
|
|
|
* Copyright (c) 1992, 1993, 1994 Henry Spencer.
|
|
|
|
* Copyright (c) 1992, 1993, 1994
|
|
|
|
* The Regents of the University of California. All rights reserved.
|
|
|
|
*
|
2011-11-20 14:45:42 +00:00
|
|
|
* Copyright (c) 2011 The FreeBSD Foundation
|
|
|
|
* All rights reserved.
|
|
|
|
* Portions of this software were developed by David Chisnall
|
|
|
|
* under sponsorship from the FreeBSD Foundation.
|
|
|
|
*
|
1994-05-27 05:00:24 +00:00
|
|
|
* This code is derived from software contributed to Berkeley by
|
|
|
|
* Henry Spencer.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 4. Neither the name of the University nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
|
|
|
* @(#)regcomp.c 8.5 (Berkeley) 3/20/94
|
|
|
|
*/
|
|
|
|
|
|
|
|
#if defined(LIBC_SCCS) && !defined(lint)
|
|
|
|
static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94";
|
|
|
|
#endif /* LIBC_SCCS and not lint */
|
2002-03-22 21:53:29 +00:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
#include <sys/types.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <ctype.h>
|
|
|
|
#include <limits.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <regex.h>
|
2004-10-03 15:42:59 +00:00
|
|
|
#include <runetype.h>
|
2004-07-12 07:35:59 +00:00
|
|
|
#include <wchar.h>
|
|
|
|
#include <wctype.h>
|
1994-05-27 05:00:24 +00:00
|
|
|
|
1996-10-31 04:38:21 +00:00
|
|
|
#include "collate.h"
|
|
|
|
|
1994-05-27 05:00:24 +00:00
|
|
|
#include "utils.h"
|
|
|
|
#include "regex2.h"
|
|
|
|
|
|
|
|
#include "cname.h"
|
|
|
|
|
|
|
|
/*
|
|
|
|
* parse structure, passed up and down to avoid global variables and
|
|
|
|
* other clumsinesses
|
|
|
|
*/
|
|
|
|
struct parse {
|
|
|
|
char *next; /* next character in RE */
|
|
|
|
char *end; /* end of string (-> NUL normally) */
|
|
|
|
int error; /* has an error been seen? */
|
|
|
|
sop *strip; /* malloced strip */
|
|
|
|
sopno ssize; /* malloced strip size (allocated) */
|
|
|
|
sopno slen; /* malloced strip length (used) */
|
|
|
|
int ncsalloc; /* number of csets allocated */
|
|
|
|
struct re_guts *g;
|
|
|
|
# define NPAREN 10 /* we need to remember () 1-9 for back refs */
|
|
|
|
sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
|
|
|
|
sopno pend[NPAREN]; /* -> ) ([0] unused) */
|
|
|
|
};
|
|
|
|
|
|
|
|
/* ========= begin header generated by ./mkh ========= */
|
|
|
|
#ifdef __cplusplus
|
|
|
|
extern "C" {
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* === regcomp.c === */
|
2011-11-11 01:35:07 +00:00
|
|
|
static void p_ere(struct parse *p, int stop);
|
2002-03-21 22:49:10 +00:00
|
|
|
static void p_ere_exp(struct parse *p);
|
|
|
|
static void p_str(struct parse *p);
|
2011-11-11 01:35:07 +00:00
|
|
|
static void p_bre(struct parse *p, int end1, int end2);
|
2002-03-21 22:49:10 +00:00
|
|
|
static int p_simp_re(struct parse *p, int starordinary);
|
|
|
|
static int p_count(struct parse *p);
|
|
|
|
static void p_bracket(struct parse *p);
|
|
|
|
static void p_b_term(struct parse *p, cset *cs);
|
|
|
|
static void p_b_cclass(struct parse *p, cset *cs);
|
|
|
|
static void p_b_eclass(struct parse *p, cset *cs);
|
2004-07-12 07:35:59 +00:00
|
|
|
static wint_t p_b_symbol(struct parse *p);
|
|
|
|
static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
|
|
|
|
static wint_t othercase(wint_t ch);
|
|
|
|
static void bothcases(struct parse *p, wint_t ch);
|
|
|
|
static void ordinary(struct parse *p, wint_t ch);
|
2002-03-21 22:49:10 +00:00
|
|
|
static void nonnewline(struct parse *p);
|
|
|
|
static void repeat(struct parse *p, sopno start, int from, int to);
|
|
|
|
static int seterr(struct parse *p, int e);
|
|
|
|
static cset *allocset(struct parse *p);
|
|
|
|
static void freeset(struct parse *p, cset *cs);
|
2004-07-12 07:35:59 +00:00
|
|
|
static void CHadd(struct parse *p, cset *cs, wint_t ch);
|
|
|
|
static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
|
|
|
|
static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
|
|
|
|
static wint_t singleton(cset *cs);
|
2002-03-21 22:49:10 +00:00
|
|
|
static sopno dupl(struct parse *p, sopno start, sopno finish);
|
|
|
|
static void doemit(struct parse *p, sop op, size_t opnd);
|
|
|
|
static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
|
|
|
|
static void dofwd(struct parse *p, sopno pos, sop value);
|
2011-11-10 01:44:05 +00:00
|
|
|
static int enlarge(struct parse *p, sopno size);
|
2002-03-21 22:49:10 +00:00
|
|
|
static void stripsnug(struct parse *p, struct re_guts *g);
|
|
|
|
static void findmust(struct parse *p, struct re_guts *g);
|
2004-07-11 05:58:31 +00:00
|
|
|
static int altoffset(sop *scan, int offset);
|
2002-03-21 22:49:10 +00:00
|
|
|
static void computejumps(struct parse *p, struct re_guts *g);
|
|
|
|
static void computematchjumps(struct parse *p, struct re_guts *g);
|
|
|
|
static sopno pluscount(struct parse *p, struct re_guts *g);
|
2004-07-12 07:35:59 +00:00
|
|
|
static wint_t wgetnext(struct parse *p);
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
#ifdef __cplusplus
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
/* ========= end header generated by ./mkh ========= */
|
|
|
|
|
|
|
|
/* NOTE(review): presumably seterr() redirects p->next/p->end into this buffer -- confirm */
static char nuls[10]; /* place to point scanner in event of error */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* macros for use with parse structure
|
|
|
|
* BEWARE: these know that the parse structure is named `p' !!!
|
|
|
|
*/
|
|
|
|
/* --- input scanning: look at / consume characters of the RE --- */
#define	PEEK()	(*p->next)
#define	PEEK2()	(*(p->next+1))
#define	MORE()	(p->next < p->end)
#define	MORE2()	(p->next+1 < p->end)
#define	SEE(c)	(MORE() && PEEK() == (c))
#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
#define	NEXT()	(p->next++)
#define	NEXT2()	(p->next += 2)
#define	NEXTn(n)	(p->next += (n))
#define	GETNEXT()	(*p->next++)
#define	WGETNEXT()	wgetnext(p)
/* --- error reporting: REQUIRE() records error `e' when `co' is false --- */
#define	SETERROR(e)	seterr(p, (e))
#define	REQUIRE(co, e)	((co) || SETERROR(e))
#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
#define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
/* --- strip emission; operands are offsets relative to HERE() --- */
#define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
/* `pos' is parenthesized so that expression arguments subtract as a unit */
#define	ASTERN(op, pos)	EMIT(op, HERE()-(pos))
#define	HERE()		(p->slen)
#define	THERE()		(p->slen - 1)
#define	THERETHERE()	(p->slen - 2)
#define	DROP(n)	(p->slen -= (n))
|
|
|
|
|
|
|
|
#ifndef NDEBUG
static int never = 0;		/* for use in asserts; shuts lint up */
#else
#define	never	0		/* some <assert.h>s have bugs too */
#endif
|
|
|
|
|
Add Boyer-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enables us to test only a fraction of the text's characters,
but has a worse performance than the straightforward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present in the text for a successful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweigh any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fails, the other one is freed.
In this case, the straightforward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
/* Macro used by computejumps()/computematchjumps() */
#define MIN(a,b) ((a)<(b)?(a):(b))
|
|
|
|
|
1994-05-27 05:00:24 +00:00
|
|
|
/*
|
|
|
|
- regcomp - interface for parser and compilation
|
|
|
|
= extern int regcomp(regex_t *, const char *, int);
|
|
|
|
= #define REG_BASIC 0000
|
|
|
|
= #define REG_EXTENDED 0001
|
|
|
|
= #define REG_ICASE 0002
|
|
|
|
= #define REG_NOSUB 0004
|
|
|
|
= #define REG_NEWLINE 0010
|
|
|
|
= #define REG_NOSPEC 0020
|
|
|
|
= #define REG_PEND 0040
|
|
|
|
= #define REG_DUMP 0200
|
|
|
|
*/
|
|
|
|
int /* 0 success, otherwise REG_something */
|
2007-06-11 03:05:54 +00:00
|
|
|
regcomp(regex_t * __restrict preg,
|
|
|
|
const char * __restrict pattern,
|
|
|
|
int cflags)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
|
|
|
struct parse pa;
|
2002-03-21 18:49:23 +00:00
|
|
|
struct re_guts *g;
|
|
|
|
struct parse *p = &pa;
|
|
|
|
int i;
|
|
|
|
size_t len;
|
1994-05-27 05:00:24 +00:00
|
|
|
#ifdef REDEBUG
|
|
|
|
# define GOODFLAGS(f) (f)
|
|
|
|
#else
|
|
|
|
# define GOODFLAGS(f) ((f)&~REG_DUMP)
|
|
|
|
#endif
|
|
|
|
|
|
|
|
cflags = GOODFLAGS(cflags);
|
|
|
|
if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
|
|
|
|
return(REG_INVARG);
|
|
|
|
|
|
|
|
if (cflags®_PEND) {
|
|
|
|
if (preg->re_endp < pattern)
|
|
|
|
return(REG_INVARG);
|
|
|
|
len = preg->re_endp - pattern;
|
|
|
|
} else
|
|
|
|
len = strlen((char *)pattern);
|
|
|
|
|
|
|
|
/* do the mallocs early so failure handling is easy */
|
2004-07-11 05:58:31 +00:00
|
|
|
g = (struct re_guts *)malloc(sizeof(struct re_guts));
|
1994-05-27 05:00:24 +00:00
|
|
|
if (g == NULL)
|
|
|
|
return(REG_ESPACE);
|
|
|
|
p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
|
|
|
|
p->strip = (sop *)malloc(p->ssize * sizeof(sop));
|
|
|
|
p->slen = 0;
|
|
|
|
if (p->strip == NULL) {
|
|
|
|
free((char *)g);
|
|
|
|
return(REG_ESPACE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* set things up */
|
|
|
|
p->g = g;
|
|
|
|
p->next = (char *)pattern; /* convenience; we do not modify it */
|
|
|
|
p->end = p->next + len;
|
|
|
|
p->error = 0;
|
|
|
|
p->ncsalloc = 0;
|
|
|
|
for (i = 0; i < NPAREN; i++) {
|
|
|
|
p->pbegin[i] = 0;
|
|
|
|
p->pend[i] = 0;
|
|
|
|
}
|
|
|
|
g->sets = NULL;
|
|
|
|
g->ncsets = 0;
|
|
|
|
g->cflags = cflags;
|
|
|
|
g->iflags = 0;
|
|
|
|
g->nbol = 0;
|
|
|
|
g->neol = 0;
|
|
|
|
g->must = NULL;
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweights the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and return then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
g->moffset = -1;
|
2000-06-29 18:53:55 +00:00
|
|
|
g->charjump = NULL;
|
|
|
|
g->matchjump = NULL;
|
1994-05-27 05:00:24 +00:00
|
|
|
g->mlen = 0;
|
|
|
|
g->nsub = 0;
|
|
|
|
g->backrefs = 0;
|
|
|
|
|
|
|
|
/* do it */
|
|
|
|
EMIT(OEND, 0);
|
|
|
|
g->firststate = THERE();
|
|
|
|
if (cflags®_EXTENDED)
|
|
|
|
p_ere(p, OUT);
|
|
|
|
else if (cflags®_NOSPEC)
|
|
|
|
p_str(p);
|
|
|
|
else
|
|
|
|
p_bre(p, OUT, OUT);
|
|
|
|
EMIT(OEND, 0);
|
|
|
|
g->laststate = THERE();
|
|
|
|
|
|
|
|
/* tidy up loose ends and fill things in */
|
|
|
|
stripsnug(p, g);
|
|
|
|
findmust(p, g);
|
Add Boyler-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enable us to test only a fraction of the text's characters,
but has a worse performance than the straight-forward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present on the text for a succesful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweight any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fail, the other one is freed.
In this case, the straight-forward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
/* only use Boyer-Moore algorithm if the pattern is bigger
|
|
|
|
* than three characters
|
|
|
|
*/
|
|
|
|
if(g->mlen > 3) {
|
|
|
|
computejumps(p, g);
|
|
|
|
computematchjumps(p, g);
|
2000-07-07 07:47:39 +00:00
|
|
|
if(g->matchjump == NULL && g->charjump != NULL) {
|
Add Boyler-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enable us to test only a fraction of the text's characters,
but has a worse performance than the straight-forward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present on the text for a succesful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweight any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fail, the other one is freed.
In this case, the straight-forward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
free(g->charjump);
|
|
|
|
g->charjump = NULL;
|
|
|
|
}
|
|
|
|
}
|
1994-05-27 05:00:24 +00:00
|
|
|
g->nplus = pluscount(p, g);
|
|
|
|
g->magic = MAGIC2;
|
|
|
|
preg->re_nsub = g->nsub;
|
|
|
|
preg->re_g = g;
|
|
|
|
preg->re_magic = MAGIC1;
|
|
|
|
#ifndef REDEBUG
|
|
|
|
/* not debugging, so can't rely on the assert() in regexec() */
|
|
|
|
if (g->iflags&BAD)
|
|
|
|
SETERROR(REG_ASSERT);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* win or lose, we're done */
|
|
|
|
if (p->error != 0) /* lose */
|
|
|
|
regfree(preg);
|
|
|
|
return(p->error);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- p_ere - ERE parser top level, concatenation and alternation
|
2011-11-11 01:35:07 +00:00
|
|
|
== static void p_ere(struct parse *p, int_t stop);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
p_ere(struct parse *p,
|
2011-11-11 01:35:07 +00:00
|
|
|
int stop) /* character this ERE should end at */
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
char c;
|
|
|
|
sopno prevback;
|
|
|
|
sopno prevfwd;
|
|
|
|
sopno conc;
|
|
|
|
int first = 1; /* is this the first alternative? */
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
for (;;) {
|
|
|
|
/* do a bunch of concatenated expressions */
|
|
|
|
conc = HERE();
|
|
|
|
while (MORE() && (c = PEEK()) != '|' && c != stop)
|
|
|
|
p_ere_exp(p);
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
if (!EAT('|'))
|
|
|
|
break; /* NOTE BREAK OUT */
|
|
|
|
|
|
|
|
if (first) {
|
|
|
|
INSERT(OCH_, conc); /* offset is wrong */
|
|
|
|
prevfwd = conc;
|
|
|
|
prevback = conc;
|
|
|
|
first = 0;
|
|
|
|
}
|
|
|
|
ASTERN(OOR1, prevback);
|
|
|
|
prevback = THERE();
|
|
|
|
AHEAD(prevfwd); /* fix previous offset */
|
|
|
|
prevfwd = HERE();
|
|
|
|
EMIT(OOR2, 0); /* offset is very wrong */
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!first) { /* tail-end fixups */
|
|
|
|
AHEAD(prevfwd);
|
|
|
|
ASTERN(O_CH, prevback);
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(!MORE() || SEE(stop));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void p_ere_exp(struct parse *p);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
p_ere_exp(struct parse *p)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
char c;
|
2004-07-12 07:35:59 +00:00
|
|
|
wint_t wc;
|
2002-03-21 18:49:23 +00:00
|
|
|
sopno pos;
|
|
|
|
int count;
|
|
|
|
int count2;
|
|
|
|
sopno subno;
|
1994-05-27 05:00:24 +00:00
|
|
|
int wascaret = 0;
|
|
|
|
|
|
|
|
assert(MORE()); /* caller should have ensured this */
|
|
|
|
c = GETNEXT();
|
|
|
|
|
|
|
|
pos = HERE();
|
|
|
|
switch (c) {
|
|
|
|
case '(':
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(MORE(), REG_EPAREN);
|
1994-05-27 05:00:24 +00:00
|
|
|
p->g->nsub++;
|
|
|
|
subno = p->g->nsub;
|
|
|
|
if (subno < NPAREN)
|
|
|
|
p->pbegin[subno] = HERE();
|
|
|
|
EMIT(OLPAREN, subno);
|
|
|
|
if (!SEE(')'))
|
|
|
|
p_ere(p, ')');
|
|
|
|
if (subno < NPAREN) {
|
|
|
|
p->pend[subno] = HERE();
|
|
|
|
assert(p->pend[subno] != 0);
|
|
|
|
}
|
|
|
|
EMIT(ORPAREN, subno);
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)MUSTEAT(')', REG_EPAREN);
|
1994-05-27 05:00:24 +00:00
|
|
|
break;
|
|
|
|
#ifndef POSIX_MISTAKE
|
|
|
|
case ')': /* happens only if no current unmatched ( */
|
|
|
|
/*
|
|
|
|
* You may ask, why the ifndef? Because I didn't notice
|
|
|
|
* this until slightly too late for 1003.2, and none of the
|
|
|
|
* other 1003.2 regular-expression reviewers noticed it at
|
|
|
|
* all. So an unmatched ) is legal POSIX, at least until
|
|
|
|
* we can get it fixed.
|
|
|
|
*/
|
|
|
|
SETERROR(REG_EPAREN);
|
|
|
|
break;
|
|
|
|
#endif
|
|
|
|
case '^':
|
|
|
|
EMIT(OBOL, 0);
|
|
|
|
p->g->iflags |= USEBOL;
|
|
|
|
p->g->nbol++;
|
|
|
|
wascaret = 1;
|
|
|
|
break;
|
|
|
|
case '$':
|
|
|
|
EMIT(OEOL, 0);
|
|
|
|
p->g->iflags |= USEEOL;
|
|
|
|
p->g->neol++;
|
|
|
|
break;
|
|
|
|
case '|':
|
|
|
|
SETERROR(REG_EMPTY);
|
|
|
|
break;
|
|
|
|
case '*':
|
|
|
|
case '+':
|
|
|
|
case '?':
|
|
|
|
SETERROR(REG_BADRPT);
|
|
|
|
break;
|
|
|
|
case '.':
|
|
|
|
if (p->g->cflags®_NEWLINE)
|
|
|
|
nonnewline(p);
|
|
|
|
else
|
|
|
|
EMIT(OANY, 0);
|
|
|
|
break;
|
|
|
|
case '[':
|
|
|
|
p_bracket(p);
|
|
|
|
break;
|
|
|
|
case '\\':
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(MORE(), REG_EESCAPE);
|
2004-07-12 07:35:59 +00:00
|
|
|
wc = WGETNEXT();
|
|
|
|
ordinary(p, wc);
|
1994-05-27 05:00:24 +00:00
|
|
|
break;
|
|
|
|
case '{': /* okay as ordinary except if digit follows */
|
1999-07-26 01:33:38 +00:00
|
|
|
(void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
|
1994-05-27 05:00:24 +00:00
|
|
|
/* FALLTHROUGH */
|
|
|
|
default:
|
2004-07-12 07:35:59 +00:00
|
|
|
p->next--;
|
|
|
|
wc = WGETNEXT();
|
|
|
|
ordinary(p, wc);
|
1994-05-27 05:00:24 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!MORE())
|
|
|
|
return;
|
|
|
|
c = PEEK();
|
|
|
|
/* we call { a repetition if followed by a digit */
|
|
|
|
if (!( c == '*' || c == '+' || c == '?' ||
|
1999-07-26 01:33:38 +00:00
|
|
|
(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
|
1994-05-27 05:00:24 +00:00
|
|
|
return; /* no repetition, we're done */
|
|
|
|
NEXT();
|
|
|
|
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(!wascaret, REG_BADRPT);
|
1994-05-27 05:00:24 +00:00
|
|
|
switch (c) {
|
|
|
|
case '*': /* implemented as +? */
|
|
|
|
/* this case does not require the (y|) trick, noKLUDGE */
|
|
|
|
INSERT(OPLUS_, pos);
|
|
|
|
ASTERN(O_PLUS, pos);
|
|
|
|
INSERT(OQUEST_, pos);
|
|
|
|
ASTERN(O_QUEST, pos);
|
|
|
|
break;
|
|
|
|
case '+':
|
|
|
|
INSERT(OPLUS_, pos);
|
|
|
|
ASTERN(O_PLUS, pos);
|
|
|
|
break;
|
|
|
|
case '?':
|
|
|
|
/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
|
|
|
|
INSERT(OCH_, pos); /* offset slightly wrong */
|
|
|
|
ASTERN(OOR1, pos); /* this one's right */
|
|
|
|
AHEAD(pos); /* fix the OCH_ */
|
|
|
|
EMIT(OOR2, 0); /* offset very wrong... */
|
|
|
|
AHEAD(THERE()); /* ...so fix it */
|
|
|
|
ASTERN(O_CH, THERETHERE());
|
|
|
|
break;
|
|
|
|
case '{':
|
|
|
|
count = p_count(p);
|
|
|
|
if (EAT(',')) {
|
1999-07-26 01:33:38 +00:00
|
|
|
if (isdigit((uch)PEEK())) {
|
1994-05-27 05:00:24 +00:00
|
|
|
count2 = p_count(p);
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(count <= count2, REG_BADBR);
|
1994-05-27 05:00:24 +00:00
|
|
|
} else /* single number with comma */
|
|
|
|
count2 = INFINITY;
|
|
|
|
} else /* just a single number */
|
|
|
|
count2 = count;
|
|
|
|
repeat(p, pos, count, count2);
|
|
|
|
if (!EAT('}')) { /* error heuristics */
|
|
|
|
while (MORE() && PEEK() != '}')
|
|
|
|
NEXT();
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(MORE(), REG_EBRACE);
|
1994-05-27 05:00:24 +00:00
|
|
|
SETERROR(REG_BADBR);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!MORE())
|
|
|
|
return;
|
|
|
|
c = PEEK();
|
|
|
|
if (!( c == '*' || c == '+' || c == '?' ||
|
1999-07-26 01:33:38 +00:00
|
|
|
(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
|
1994-05-27 05:00:24 +00:00
|
|
|
return;
|
|
|
|
SETERROR(REG_BADRPT);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- p_str - string (no metacharacters) "parser"
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void p_str(struct parse *p);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
p_str(struct parse *p)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(MORE(), REG_EMPTY);
|
1994-05-27 05:00:24 +00:00
|
|
|
while (MORE())
|
2004-07-12 07:35:59 +00:00
|
|
|
ordinary(p, WGETNEXT());
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- p_bre - BRE parser top level, anchoring and concatenation
|
2011-11-11 01:35:07 +00:00
|
|
|
== static void p_bre(struct parse *p, int end1, \
|
|
|
|
== int end2);
|
1994-05-27 05:00:24 +00:00
|
|
|
* Giving end1 as OUT essentially eliminates the end1/end2 check.
|
|
|
|
*
|
|
|
|
* This implementation is a bit of a kludge, in that a trailing $ is first
|
2004-07-11 05:58:31 +00:00
|
|
|
* taken as an ordinary character and then revised to be an anchor.
|
1994-05-27 05:00:24 +00:00
|
|
|
* The amount of lookahead needed to avoid this kludge is excessive.
|
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
p_bre(struct parse *p,
|
2011-11-11 01:35:07 +00:00
|
|
|
int end1, /* first terminating character */
|
|
|
|
int end2) /* second terminating character */
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
sopno start = HERE();
|
|
|
|
int first = 1; /* first subexpression? */
|
|
|
|
int wasdollar = 0;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
if (EAT('^')) {
|
|
|
|
EMIT(OBOL, 0);
|
|
|
|
p->g->iflags |= USEBOL;
|
|
|
|
p->g->nbol++;
|
|
|
|
}
|
|
|
|
while (MORE() && !SEETWO(end1, end2)) {
|
|
|
|
wasdollar = p_simp_re(p, first);
|
|
|
|
first = 0;
|
|
|
|
}
|
|
|
|
if (wasdollar) { /* oops, that was a trailing anchor */
|
|
|
|
DROP(1);
|
|
|
|
EMIT(OEOL, 0);
|
|
|
|
p->g->iflags |= USEEOL;
|
|
|
|
p->g->neol++;
|
|
|
|
}
|
|
|
|
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- p_simp_re - parse a simple RE, an atom possibly followed by a repetition
|
2002-03-21 18:49:23 +00:00
|
|
|
== static int p_simp_re(struct parse *p, int starordinary);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static int /* was the simple RE an unbackslashed $? */
|
2007-06-11 03:05:54 +00:00
|
|
|
p_simp_re(struct parse *p,
|
|
|
|
int starordinary) /* is a leading * an ordinary character? */
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
int c;
|
|
|
|
int count;
|
|
|
|
int count2;
|
|
|
|
sopno pos;
|
|
|
|
int i;
|
2004-07-12 07:35:59 +00:00
|
|
|
wint_t wc;
|
2002-03-21 18:49:23 +00:00
|
|
|
sopno subno;
|
1994-05-27 05:00:24 +00:00
|
|
|
# define BACKSL (1<<CHAR_BIT)
|
|
|
|
|
|
|
|
pos = HERE(); /* repetion op, if any, covers from here */
|
|
|
|
|
|
|
|
assert(MORE()); /* caller should have ensured this */
|
|
|
|
c = GETNEXT();
|
|
|
|
if (c == '\\') {
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(MORE(), REG_EESCAPE);
|
1999-07-26 01:33:38 +00:00
|
|
|
c = BACKSL | GETNEXT();
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
|
|
|
switch (c) {
|
|
|
|
case '.':
|
|
|
|
if (p->g->cflags®_NEWLINE)
|
|
|
|
nonnewline(p);
|
|
|
|
else
|
|
|
|
EMIT(OANY, 0);
|
|
|
|
break;
|
|
|
|
case '[':
|
|
|
|
p_bracket(p);
|
|
|
|
break;
|
|
|
|
case BACKSL|'{':
|
|
|
|
SETERROR(REG_BADRPT);
|
|
|
|
break;
|
|
|
|
case BACKSL|'(':
|
|
|
|
p->g->nsub++;
|
|
|
|
subno = p->g->nsub;
|
|
|
|
if (subno < NPAREN)
|
|
|
|
p->pbegin[subno] = HERE();
|
|
|
|
EMIT(OLPAREN, subno);
|
|
|
|
/* the MORE here is an error heuristic */
|
|
|
|
if (MORE() && !SEETWO('\\', ')'))
|
|
|
|
p_bre(p, '\\', ')');
|
|
|
|
if (subno < NPAREN) {
|
|
|
|
p->pend[subno] = HERE();
|
|
|
|
assert(p->pend[subno] != 0);
|
|
|
|
}
|
|
|
|
EMIT(ORPAREN, subno);
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
|
1994-05-27 05:00:24 +00:00
|
|
|
break;
|
|
|
|
case BACKSL|')': /* should not get here -- must be user */
|
|
|
|
case BACKSL|'}':
|
|
|
|
SETERROR(REG_EPAREN);
|
|
|
|
break;
|
|
|
|
case BACKSL|'1':
|
|
|
|
case BACKSL|'2':
|
|
|
|
case BACKSL|'3':
|
|
|
|
case BACKSL|'4':
|
|
|
|
case BACKSL|'5':
|
|
|
|
case BACKSL|'6':
|
|
|
|
case BACKSL|'7':
|
|
|
|
case BACKSL|'8':
|
|
|
|
case BACKSL|'9':
|
|
|
|
i = (c&~BACKSL) - '0';
|
|
|
|
assert(i < NPAREN);
|
|
|
|
if (p->pend[i] != 0) {
|
|
|
|
assert(i <= p->g->nsub);
|
|
|
|
EMIT(OBACK_, i);
|
|
|
|
assert(p->pbegin[i] != 0);
|
|
|
|
assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
|
|
|
|
assert(OP(p->strip[p->pend[i]]) == ORPAREN);
|
|
|
|
(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
|
|
|
|
EMIT(O_BACK, i);
|
|
|
|
} else
|
|
|
|
SETERROR(REG_ESUBREG);
|
|
|
|
p->g->backrefs = 1;
|
|
|
|
break;
|
|
|
|
case '*':
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(starordinary, REG_BADRPT);
|
1994-05-27 05:00:24 +00:00
|
|
|
/* FALLTHROUGH */
|
|
|
|
default:
|
2004-07-12 07:35:59 +00:00
|
|
|
p->next--;
|
|
|
|
wc = WGETNEXT();
|
|
|
|
ordinary(p, wc);
|
1994-05-27 05:00:24 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (EAT('*')) { /* implemented as +? */
|
|
|
|
/* this case does not require the (y|) trick, noKLUDGE */
|
|
|
|
INSERT(OPLUS_, pos);
|
|
|
|
ASTERN(O_PLUS, pos);
|
|
|
|
INSERT(OQUEST_, pos);
|
|
|
|
ASTERN(O_QUEST, pos);
|
|
|
|
} else if (EATTWO('\\', '{')) {
|
|
|
|
count = p_count(p);
|
|
|
|
if (EAT(',')) {
|
1999-07-26 01:33:38 +00:00
|
|
|
if (MORE() && isdigit((uch)PEEK())) {
|
1994-05-27 05:00:24 +00:00
|
|
|
count2 = p_count(p);
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(count <= count2, REG_BADBR);
|
1994-05-27 05:00:24 +00:00
|
|
|
} else /* single number with comma */
|
|
|
|
count2 = INFINITY;
|
|
|
|
} else /* just a single number */
|
|
|
|
count2 = count;
|
|
|
|
repeat(p, pos, count, count2);
|
|
|
|
if (!EATTWO('\\', '}')) { /* error heuristics */
|
|
|
|
while (MORE() && !SEETWO('\\', '}'))
|
|
|
|
NEXT();
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(MORE(), REG_EBRACE);
|
1994-05-27 05:00:24 +00:00
|
|
|
SETERROR(REG_BADBR);
|
|
|
|
}
|
1999-07-26 01:33:38 +00:00
|
|
|
} else if (c == '$') /* $ (but not \$) ends it */
|
1994-05-27 05:00:24 +00:00
|
|
|
return(1);
|
|
|
|
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- p_count - parse a repetition count
|
2002-03-21 18:49:23 +00:00
|
|
|
== static int p_count(struct parse *p);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static int /* the value */
|
2007-06-11 03:05:54 +00:00
|
|
|
p_count(struct parse *p)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
int count = 0;
|
|
|
|
int ndigits = 0;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
1999-07-26 01:33:38 +00:00
|
|
|
while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
|
1994-05-27 05:00:24 +00:00
|
|
|
count = count*10 + (GETNEXT() - '0');
|
|
|
|
ndigits++;
|
|
|
|
}
|
|
|
|
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
|
1994-05-27 05:00:24 +00:00
|
|
|
return(count);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- p_bracket - parse a bracketed character list
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void p_bracket(struct parse *p);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
p_bracket(struct parse *p)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2004-07-12 07:35:59 +00:00
|
|
|
cset *cs;
|
|
|
|
wint_t ch;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
/* Dept of Truly Sickening Special-Case Kludges */
|
|
|
|
if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
|
|
|
|
EMIT(OBOW, 0);
|
|
|
|
NEXTn(6);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
|
|
|
|
EMIT(OEOW, 0);
|
|
|
|
NEXTn(6);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2004-07-12 07:35:59 +00:00
|
|
|
if ((cs = allocset(p)) == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (p->g->cflags®_ICASE)
|
|
|
|
cs->icase = 1;
|
1994-05-27 05:00:24 +00:00
|
|
|
if (EAT('^'))
|
2004-07-12 07:35:59 +00:00
|
|
|
cs->invert = 1;
|
1994-05-27 05:00:24 +00:00
|
|
|
if (EAT(']'))
|
2004-07-12 07:35:59 +00:00
|
|
|
CHadd(p, cs, ']');
|
1994-05-27 05:00:24 +00:00
|
|
|
else if (EAT('-'))
|
2004-07-12 07:35:59 +00:00
|
|
|
CHadd(p, cs, '-');
|
1994-05-27 05:00:24 +00:00
|
|
|
while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
|
|
|
|
p_b_term(p, cs);
|
|
|
|
if (EAT('-'))
|
2004-07-12 07:35:59 +00:00
|
|
|
CHadd(p, cs, '-');
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)MUSTEAT(']', REG_EBRACK);
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
if (p->error != 0) /* don't mess things up further */
|
|
|
|
return;
|
|
|
|
|
2004-07-12 07:35:59 +00:00
|
|
|
if (cs->invert && p->g->cflags®_NEWLINE)
|
|
|
|
cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
|
1994-05-27 05:00:24 +00:00
|
|
|
|
2004-07-12 07:35:59 +00:00
|
|
|
if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
|
|
|
|
ordinary(p, ch);
|
1994-05-27 05:00:24 +00:00
|
|
|
freeset(p, cs);
|
|
|
|
} else
|
2004-07-12 07:35:59 +00:00
|
|
|
EMIT(OANYOF, (int)(cs - p->g->sets));
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- p_b_term - parse one term of a bracketed character list
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void p_b_term(struct parse *p, cset *cs);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
p_b_term(struct parse *p, cset *cs)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
char c;
|
2004-07-12 07:35:59 +00:00
|
|
|
wint_t start, finish;
|
|
|
|
wint_t i;
|
2011-11-20 14:45:42 +00:00
|
|
|
struct xlocale_collate *table =
|
|
|
|
(struct xlocale_collate*)__get_locale()->components[XLC_COLLATE];
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
/* classify what we've got */
|
|
|
|
switch ((MORE()) ? PEEK() : '\0') {
|
|
|
|
case '[':
|
|
|
|
c = (MORE2()) ? PEEK2() : '\0';
|
|
|
|
break;
|
|
|
|
case '-':
|
|
|
|
SETERROR(REG_ERANGE);
|
|
|
|
return; /* NOTE RETURN */
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
c = '\0';
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (c) {
|
|
|
|
case ':': /* character class */
|
|
|
|
NEXT2();
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(MORE(), REG_EBRACK);
|
1994-05-27 05:00:24 +00:00
|
|
|
c = PEEK();
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
|
1994-05-27 05:00:24 +00:00
|
|
|
p_b_cclass(p, cs);
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(MORE(), REG_EBRACK);
|
|
|
|
(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
|
1994-05-27 05:00:24 +00:00
|
|
|
break;
|
|
|
|
case '=': /* equivalence class */
|
|
|
|
NEXT2();
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(MORE(), REG_EBRACK);
|
1994-05-27 05:00:24 +00:00
|
|
|
c = PEEK();
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
|
1994-05-27 05:00:24 +00:00
|
|
|
p_b_eclass(p, cs);
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(MORE(), REG_EBRACK);
|
|
|
|
(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
|
1994-05-27 05:00:24 +00:00
|
|
|
break;
|
|
|
|
default: /* symbol, ordinary character, or range */
|
|
|
|
start = p_b_symbol(p);
|
|
|
|
if (SEE('-') && MORE2() && PEEK2() != ']') {
|
|
|
|
/* range */
|
|
|
|
NEXT();
|
|
|
|
if (EAT('-'))
|
|
|
|
finish = '-';
|
|
|
|
else
|
|
|
|
finish = p_b_symbol(p);
|
|
|
|
} else
|
|
|
|
finish = start;
|
1996-08-11 16:08:17 +00:00
|
|
|
if (start == finish)
|
2004-07-12 07:35:59 +00:00
|
|
|
CHadd(p, cs, start);
|
1996-08-11 16:08:17 +00:00
|
|
|
else {
|
2011-11-20 14:45:42 +00:00
|
|
|
if (table->__collate_load_error) {
|
1997-04-04 19:40:49 +00:00
|
|
|
(void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
|
2004-07-12 07:35:59 +00:00
|
|
|
CHaddrange(p, cs, start, finish);
|
1997-04-04 19:40:49 +00:00
|
|
|
} else {
|
2011-11-20 14:45:42 +00:00
|
|
|
(void)REQUIRE(__collate_range_cmp(table, start, finish) <= 0, REG_ERANGE);
|
2004-07-12 07:35:59 +00:00
|
|
|
for (i = 0; i <= UCHAR_MAX; i++) {
|
2011-11-20 14:45:42 +00:00
|
|
|
if ( __collate_range_cmp(table, start, i) <= 0
|
|
|
|
&& __collate_range_cmp(table, i, finish) <= 0
|
1997-04-04 19:40:49 +00:00
|
|
|
)
|
2004-07-12 07:35:59 +00:00
|
|
|
CHadd(p, cs, i);
|
1997-04-04 19:40:49 +00:00
|
|
|
}
|
1996-08-11 16:08:17 +00:00
|
|
|
}
|
|
|
|
}
|
1994-05-27 05:00:24 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- p_b_cclass - parse a character-class name and deal with it
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void p_b_cclass(struct parse *p, cset *cs);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
p_b_cclass(struct parse *p, cset *cs)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
char *sp = p->next;
|
|
|
|
size_t len;
|
2004-07-12 07:35:59 +00:00
|
|
|
wctype_t wct;
|
|
|
|
char clname[16];
|
1994-05-27 05:00:24 +00:00
|
|
|
|
1996-08-11 11:42:03 +00:00
|
|
|
while (MORE() && isalpha((uch)PEEK()))
|
1994-05-27 05:00:24 +00:00
|
|
|
NEXT();
|
|
|
|
len = p->next - sp;
|
2004-07-12 07:35:59 +00:00
|
|
|
if (len >= sizeof(clname) - 1) {
|
1994-05-27 05:00:24 +00:00
|
|
|
SETERROR(REG_ECTYPE);
|
|
|
|
return;
|
|
|
|
}
|
2004-07-12 07:35:59 +00:00
|
|
|
memcpy(clname, sp, len);
|
|
|
|
clname[len] = '\0';
|
|
|
|
if ((wct = wctype(clname)) == 0) {
|
|
|
|
SETERROR(REG_ECTYPE);
|
|
|
|
return;
|
1996-08-11 11:42:03 +00:00
|
|
|
}
|
2004-07-12 07:35:59 +00:00
|
|
|
CHaddtype(p, cs, wct);
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- p_b_eclass - parse an equivalence-class name and deal with it
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void p_b_eclass(struct parse *p, cset *cs);
|
1994-05-27 05:00:24 +00:00
|
|
|
*
|
|
|
|
* This implementation is incomplete. xxx
|
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
p_b_eclass(struct parse *p, cset *cs)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2004-07-12 07:35:59 +00:00
|
|
|
wint_t c;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
c = p_b_coll_elem(p, '=');
|
2004-07-12 07:35:59 +00:00
|
|
|
CHadd(p, cs, c);
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
|
2011-11-10 01:44:05 +00:00
|
|
|
== static wint_t p_b_symbol(struct parse *p);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
2004-07-12 07:35:59 +00:00
|
|
|
static wint_t /* value of symbol */
|
2007-06-11 03:05:54 +00:00
|
|
|
p_b_symbol(struct parse *p)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2004-07-12 07:35:59 +00:00
|
|
|
wint_t value;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(MORE(), REG_EBRACK);
|
1994-05-27 05:00:24 +00:00
|
|
|
if (!EATTWO('[', '.'))
|
2004-07-12 07:35:59 +00:00
|
|
|
return(WGETNEXT());
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
/* collating symbol */
|
|
|
|
value = p_b_coll_elem(p, '.');
|
1996-07-12 18:57:58 +00:00
|
|
|
(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
|
1994-05-27 05:00:24 +00:00
|
|
|
return(value);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- p_b_coll_elem - parse a collating-element name and look it up
|
2011-11-10 01:44:05 +00:00
|
|
|
== static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
2004-07-12 07:35:59 +00:00
|
|
|
static wint_t /* value of collating element */
|
2007-06-11 03:05:54 +00:00
|
|
|
p_b_coll_elem(struct parse *p,
|
|
|
|
wint_t endc) /* name ended by endc,']' */
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
char *sp = p->next;
|
|
|
|
struct cname *cp;
|
|
|
|
int len;
|
2004-07-12 07:35:59 +00:00
|
|
|
mbstate_t mbs;
|
|
|
|
wchar_t wc;
|
|
|
|
size_t clen;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
while (MORE() && !SEETWO(endc, ']'))
|
|
|
|
NEXT();
|
|
|
|
if (!MORE()) {
|
|
|
|
SETERROR(REG_EBRACK);
|
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
len = p->next - sp;
|
|
|
|
for (cp = cnames; cp->name != NULL; cp++)
|
|
|
|
if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
|
|
|
|
return(cp->code); /* known name */
|
2004-07-12 07:35:59 +00:00
|
|
|
memset(&mbs, 0, sizeof(mbs));
|
|
|
|
if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
|
|
|
|
return (wc); /* single character */
|
|
|
|
else if (clen == (size_t)-1 || clen == (size_t)-2)
|
|
|
|
SETERROR(REG_ILLSEQ);
|
|
|
|
else
|
|
|
|
SETERROR(REG_ECOLLATE); /* neither */
|
1994-05-27 05:00:24 +00:00
|
|
|
return(0);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 - othercase - return the case counterpart of an alphabetic
 == static wint_t othercase(wint_t ch);
 *
 * ch must be alphabetic (asserted).  Upper-case input yields its
 * lower-case form and vice versa; an alphabetic that is neither is
 * returned unchanged.
 */
static wint_t			/* if no counterpart, return ch */
othercase(wint_t ch)
{
	assert(iswalpha(ch));
	if (iswupper(ch))
		return(towlower(ch));
	else if (iswlower(ch))
		return(towupper(ch));
	else			/* peculiar, but could happen */
		return(ch);
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- bothcases - emit a dualcase version of a two-case character
|
2011-11-10 01:44:05 +00:00
|
|
|
== static void bothcases(struct parse *p, wint_t ch);
|
1994-05-27 05:00:24 +00:00
|
|
|
*
|
|
|
|
* Boy, is this implementation ever a kludge...
|
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
bothcases(struct parse *p, wint_t ch)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
char *oldnext = p->next;
|
|
|
|
char *oldend = p->end;
|
2004-07-12 07:35:59 +00:00
|
|
|
char bracket[3 + MB_LEN_MAX];
|
|
|
|
size_t n;
|
|
|
|
mbstate_t mbs;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
assert(othercase(ch) != ch); /* p_bracket() would recurse */
|
|
|
|
p->next = bracket;
|
2004-07-12 07:35:59 +00:00
|
|
|
memset(&mbs, 0, sizeof(mbs));
|
|
|
|
n = wcrtomb(bracket, ch, &mbs);
|
|
|
|
assert(n != (size_t)-1);
|
|
|
|
bracket[n] = ']';
|
|
|
|
bracket[n + 1] = '\0';
|
|
|
|
p->end = bracket+n+1;
|
1994-05-27 05:00:24 +00:00
|
|
|
p_bracket(p);
|
2004-07-12 07:35:59 +00:00
|
|
|
assert(p->next == p->end);
|
1994-05-27 05:00:24 +00:00
|
|
|
p->next = oldnext;
|
|
|
|
p->end = oldend;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- ordinary - emit an ordinary character
|
2011-11-10 01:44:05 +00:00
|
|
|
== static void ordinary(struct parse *p, wint_t ch);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
ordinary(struct parse *p, wint_t ch)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2004-07-12 07:35:59 +00:00
|
|
|
cset *cs;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
2004-07-12 07:35:59 +00:00
|
|
|
if ((p->g->cflags®_ICASE) && iswalpha(ch) && othercase(ch) != ch)
|
1994-05-27 05:00:24 +00:00
|
|
|
bothcases(p, ch);
|
2004-07-12 07:35:59 +00:00
|
|
|
else if ((ch & OPDMASK) == ch)
|
|
|
|
EMIT(OCHAR, ch);
|
|
|
|
else {
|
|
|
|
/*
|
|
|
|
* Kludge: character is too big to fit into an OCHAR operand.
|
|
|
|
* Emit a singleton set.
|
|
|
|
*/
|
|
|
|
if ((cs = allocset(p)) == NULL)
|
|
|
|
return;
|
|
|
|
CHadd(p, cs, ch);
|
|
|
|
EMIT(OANYOF, (int)(cs - p->g->sets));
|
|
|
|
}
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- nonnewline - emit REG_NEWLINE version of OANY
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void nonnewline(struct parse *p);
|
1994-05-27 05:00:24 +00:00
|
|
|
*
|
|
|
|
* Boy, is this implementation ever a kludge...
|
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
nonnewline(struct parse *p)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
char *oldnext = p->next;
|
|
|
|
char *oldend = p->end;
|
1994-05-27 05:00:24 +00:00
|
|
|
char bracket[4];
|
|
|
|
|
|
|
|
p->next = bracket;
|
|
|
|
p->end = bracket+3;
|
|
|
|
bracket[0] = '^';
|
|
|
|
bracket[1] = '\n';
|
|
|
|
bracket[2] = ']';
|
|
|
|
bracket[3] = '\0';
|
|
|
|
p_bracket(p);
|
|
|
|
assert(p->next == bracket+3);
|
|
|
|
p->next = oldnext;
|
|
|
|
p->end = oldend;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- repeat - generate code for a bounded repetition, recursively if needed
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void repeat(struct parse *p, sopno start, int from, int to);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
repeat(struct parse *p,
|
|
|
|
sopno start, /* operand from here to end of strip */
|
|
|
|
int from, /* repeated from this number */
|
|
|
|
int to) /* to this number of times (maybe INFINITY) */
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
sopno finish = HERE();
|
1994-05-27 05:00:24 +00:00
|
|
|
# define N 2
|
|
|
|
# define INF 3
|
|
|
|
# define REP(f, t) ((f)*8 + (t))
|
|
|
|
# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
|
2002-03-21 18:49:23 +00:00
|
|
|
sopno copy;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
if (p->error != 0) /* head off possible runaway recursion */
|
|
|
|
return;
|
|
|
|
|
|
|
|
assert(from <= to);
|
|
|
|
|
|
|
|
switch (REP(MAP(from), MAP(to))) {
|
|
|
|
case REP(0, 0): /* must be user doing this */
|
|
|
|
DROP(finish-start); /* drop the operand */
|
|
|
|
break;
|
|
|
|
case REP(0, 1): /* as x{1,1}? */
|
|
|
|
case REP(0, N): /* as x{1,n}? */
|
|
|
|
case REP(0, INF): /* as x{1,}? */
|
|
|
|
/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
|
|
|
|
INSERT(OCH_, start); /* offset is wrong... */
|
|
|
|
repeat(p, start+1, 1, to);
|
|
|
|
ASTERN(OOR1, start);
|
|
|
|
AHEAD(start); /* ... fix it */
|
|
|
|
EMIT(OOR2, 0);
|
|
|
|
AHEAD(THERE());
|
|
|
|
ASTERN(O_CH, THERETHERE());
|
|
|
|
break;
|
|
|
|
case REP(1, 1): /* trivial case */
|
|
|
|
/* done */
|
|
|
|
break;
|
|
|
|
case REP(1, N): /* as x?x{1,n-1} */
|
|
|
|
/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
|
|
|
|
INSERT(OCH_, start);
|
|
|
|
ASTERN(OOR1, start);
|
|
|
|
AHEAD(start);
|
|
|
|
EMIT(OOR2, 0); /* offset very wrong... */
|
|
|
|
AHEAD(THERE()); /* ...so fix it */
|
|
|
|
ASTERN(O_CH, THERETHERE());
|
|
|
|
copy = dupl(p, start+1, finish+1);
|
|
|
|
assert(copy == finish+4);
|
|
|
|
repeat(p, copy, 1, to-1);
|
|
|
|
break;
|
|
|
|
case REP(1, INF): /* as x+ */
|
|
|
|
INSERT(OPLUS_, start);
|
|
|
|
ASTERN(O_PLUS, start);
|
|
|
|
break;
|
|
|
|
case REP(N, N): /* as xx{m-1,n-1} */
|
|
|
|
copy = dupl(p, start, finish);
|
|
|
|
repeat(p, copy, from-1, to-1);
|
|
|
|
break;
|
|
|
|
case REP(N, INF): /* as xx{n-1,INF} */
|
|
|
|
copy = dupl(p, start, finish);
|
|
|
|
repeat(p, copy, from-1, to);
|
|
|
|
break;
|
|
|
|
default: /* "can't happen" */
|
|
|
|
SETERROR(REG_ASSERT); /* just in case */
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2004-07-12 07:35:59 +00:00
|
|
|
/*
|
|
|
|
- wgetnext - helper function for WGETNEXT() macro. Gets the next wide
|
|
|
|
- character from the parse struct, signals a REG_ILLSEQ error if the
|
|
|
|
- character can't be converted. Returns the number of bytes consumed.
|
|
|
|
*/
|
|
|
|
static wint_t
|
2007-06-11 03:05:54 +00:00
|
|
|
wgetnext(struct parse *p)
|
2004-07-12 07:35:59 +00:00
|
|
|
{
|
|
|
|
mbstate_t mbs;
|
|
|
|
wchar_t wc;
|
|
|
|
size_t n;
|
|
|
|
|
|
|
|
memset(&mbs, 0, sizeof(mbs));
|
|
|
|
n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
|
|
|
|
if (n == (size_t)-1 || n == (size_t)-2) {
|
|
|
|
SETERROR(REG_ILLSEQ);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
if (n == 0)
|
|
|
|
n = 1;
|
|
|
|
p->next += n;
|
|
|
|
return (wc);
|
|
|
|
}
|
|
|
|
|
1994-05-27 05:00:24 +00:00
|
|
|
/*
|
|
|
|
- seterr - set an error condition
|
2002-03-21 18:49:23 +00:00
|
|
|
== static int seterr(struct parse *p, int e);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static int /* useless but makes type checking happy */
|
2007-06-11 03:05:54 +00:00
|
|
|
seterr(struct parse *p, int e)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
|
|
|
if (p->error == 0) /* keep earliest error condition */
|
|
|
|
p->error = e;
|
|
|
|
p->next = nuls; /* try to bring things to a halt */
|
|
|
|
p->end = nuls;
|
|
|
|
return(0); /* make the return value well-defined */
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- allocset - allocate a set of characters for []
|
2002-03-21 18:49:23 +00:00
|
|
|
== static cset *allocset(struct parse *p);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static cset *
|
2007-06-11 03:05:54 +00:00
|
|
|
allocset(struct parse *p)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2004-07-12 07:35:59 +00:00
|
|
|
cset *cs, *ncs;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
2004-07-12 07:35:59 +00:00
|
|
|
ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs));
|
|
|
|
if (ncs == NULL) {
|
|
|
|
SETERROR(REG_ESPACE);
|
|
|
|
return (NULL);
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
2004-07-12 07:35:59 +00:00
|
|
|
p->g->sets = ncs;
|
|
|
|
cs = &p->g->sets[p->g->ncsets++];
|
|
|
|
memset(cs, 0, sizeof(*cs));
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
return(cs);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- freeset - free a now-unused set
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void freeset(struct parse *p, cset *cs);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
freeset(struct parse *p, cset *cs)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
cset *top = &p->g->sets[p->g->ncsets];
|
1994-05-27 05:00:24 +00:00
|
|
|
|
2004-07-12 07:35:59 +00:00
|
|
|
free(cs->wides);
|
|
|
|
free(cs->ranges);
|
|
|
|
free(cs->types);
|
|
|
|
memset(cs, 0, sizeof(*cs));
|
1994-05-27 05:00:24 +00:00
|
|
|
if (cs == top-1) /* recover only the easy case */
|
|
|
|
p->g->ncsets--;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2004-07-12 07:35:59 +00:00
|
|
|
- singleton - Determine whether a set contains only one character,
|
|
|
|
- returning it if so, otherwise returning OUT.
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
2004-07-12 07:35:59 +00:00
|
|
|
static wint_t
|
2007-06-11 03:05:54 +00:00
|
|
|
singleton(cset *cs)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2004-07-12 07:35:59 +00:00
|
|
|
wint_t i, s, n;
|
|
|
|
|
|
|
|
for (i = n = 0; i < NC; i++)
|
|
|
|
if (CHIN(cs, i)) {
|
|
|
|
n++;
|
|
|
|
s = i;
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
2004-07-12 07:35:59 +00:00
|
|
|
if (n == 1)
|
|
|
|
return (s);
|
|
|
|
if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
|
|
|
|
cs->icase == 0)
|
|
|
|
return (cs->wides[0]);
|
|
|
|
/* Don't bother handling the other cases. */
|
|
|
|
return (OUT);
|
|
|
|
}
|
1994-05-27 05:00:24 +00:00
|
|
|
|
2004-07-12 07:35:59 +00:00
|
|
|
/*
|
|
|
|
- CHadd - add character to character set.
|
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
CHadd(struct parse *p, cset *cs, wint_t ch)
|
2004-07-12 07:35:59 +00:00
|
|
|
{
|
2004-09-05 08:30:42 +00:00
|
|
|
wint_t nch, *newwides;
|
2004-07-12 07:35:59 +00:00
|
|
|
assert(ch >= 0);
|
2004-09-05 08:30:42 +00:00
|
|
|
if (ch < NC)
|
2004-07-12 07:35:59 +00:00
|
|
|
cs->bmp[ch >> 3] |= 1 << (ch & 7);
|
2004-09-05 08:30:42 +00:00
|
|
|
else {
|
2004-07-12 07:35:59 +00:00
|
|
|
newwides = realloc(cs->wides, (cs->nwides + 1) *
|
|
|
|
sizeof(*cs->wides));
|
|
|
|
if (newwides == NULL) {
|
|
|
|
SETERROR(REG_ESPACE);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
cs->wides = newwides;
|
|
|
|
cs->wides[cs->nwides++] = ch;
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
2004-09-05 08:30:42 +00:00
|
|
|
if (cs->icase) {
|
|
|
|
if ((nch = towlower(ch)) < NC)
|
|
|
|
cs->bmp[nch >> 3] |= 1 << (nch & 7);
|
|
|
|
if ((nch = towupper(ch)) < NC)
|
|
|
|
cs->bmp[nch >> 3] |= 1 << (nch & 7);
|
|
|
|
}
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2004-07-12 07:35:59 +00:00
|
|
|
- CHaddrange - add all characters in the range [min,max] to a character set.
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
2004-07-12 07:35:59 +00:00
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2004-07-12 07:35:59 +00:00
|
|
|
crange *newranges;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
2004-07-12 07:35:59 +00:00
|
|
|
for (; min < NC && min <= max; min++)
|
|
|
|
CHadd(p, cs, min);
|
|
|
|
if (min >= max)
|
|
|
|
return;
|
|
|
|
newranges = realloc(cs->ranges, (cs->nranges + 1) *
|
|
|
|
sizeof(*cs->ranges));
|
|
|
|
if (newranges == NULL) {
|
|
|
|
SETERROR(REG_ESPACE);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
cs->ranges = newranges;
|
|
|
|
cs->ranges[cs->nranges].min = min;
|
2013-03-01 23:26:13 +00:00
|
|
|
cs->ranges[cs->nranges].max = max;
|
2004-07-12 07:35:59 +00:00
|
|
|
cs->nranges++;
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2004-07-12 07:35:59 +00:00
|
|
|
- CHaddtype - add all characters of a certain type to a character set.
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
2004-07-12 07:35:59 +00:00
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
CHaddtype(struct parse *p, cset *cs, wctype_t wct)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2004-07-12 07:35:59 +00:00
|
|
|
wint_t i;
|
|
|
|
wctype_t *newtypes;
|
|
|
|
|
2004-09-05 08:30:42 +00:00
|
|
|
for (i = 0; i < NC; i++)
|
2004-07-12 07:35:59 +00:00
|
|
|
if (iswctype(i, wct))
|
|
|
|
CHadd(p, cs, i);
|
|
|
|
newtypes = realloc(cs->types, (cs->ntypes + 1) *
|
|
|
|
sizeof(*cs->types));
|
|
|
|
if (newtypes == NULL) {
|
|
|
|
SETERROR(REG_ESPACE);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
cs->types = newtypes;
|
|
|
|
cs->types[cs->ntypes++] = wct;
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- dupl - emit a duplicate of a bunch of sops
|
2002-03-21 18:49:23 +00:00
|
|
|
== static sopno dupl(struct parse *p, sopno start, sopno finish);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static sopno /* start of duplicate */
|
2007-06-11 03:05:54 +00:00
|
|
|
dupl(struct parse *p,
|
|
|
|
sopno start, /* from here */
|
|
|
|
sopno finish) /* to this less one */
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
sopno ret = HERE();
|
|
|
|
sopno len = finish - start;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
assert(finish >= start);
|
|
|
|
if (len == 0)
|
|
|
|
return(ret);
|
2011-11-10 01:44:05 +00:00
|
|
|
if (!enlarge(p, p->ssize + len)) /* this many unexpected additions */
|
|
|
|
return(ret);
|
1994-05-27 05:00:24 +00:00
|
|
|
(void) memcpy((char *)(p->strip + p->slen),
|
|
|
|
(char *)(p->strip + start), (size_t)len*sizeof(sop));
|
|
|
|
p->slen += len;
|
|
|
|
return(ret);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- doemit - emit a strip operator
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void doemit(struct parse *p, sop op, size_t opnd);
|
1994-05-27 05:00:24 +00:00
|
|
|
*
|
|
|
|
* It might seem better to implement this as a macro with a function as
|
|
|
|
* hard-case backup, but it's just too big and messy unless there are
|
|
|
|
* some changes to the data structures. Maybe later.
|
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
doemit(struct parse *p, sop op, size_t opnd)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
|
|
|
/* avoid making error situations worse */
|
|
|
|
if (p->error != 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* deal with oversize operands ("can't happen", more or less) */
|
|
|
|
assert(opnd < 1<<OPSHIFT);
|
|
|
|
|
|
|
|
/* deal with undersized strip */
|
|
|
|
if (p->slen >= p->ssize)
|
2011-11-10 01:44:05 +00:00
|
|
|
if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */
|
|
|
|
return;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
/* finally, it's all reduced to the easy case */
|
|
|
|
p->strip[p->slen++] = SOP(op, opnd);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- doinsert - insert a sop into the strip
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
sopno sn;
|
|
|
|
sop s;
|
|
|
|
int i;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
/* avoid making error situations worse */
|
|
|
|
if (p->error != 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
sn = HERE();
|
|
|
|
EMIT(op, opnd); /* do checks, ensure space */
|
|
|
|
assert(HERE() == sn+1);
|
|
|
|
s = p->strip[sn];
|
|
|
|
|
|
|
|
/* adjust paren pointers */
|
|
|
|
assert(pos > 0);
|
|
|
|
for (i = 1; i < NPAREN; i++) {
|
|
|
|
if (p->pbegin[i] >= pos) {
|
|
|
|
p->pbegin[i]++;
|
|
|
|
}
|
|
|
|
if (p->pend[i] >= pos) {
|
|
|
|
p->pend[i]++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
|
|
|
|
(HERE()-pos-1)*sizeof(sop));
|
|
|
|
p->strip[pos] = s;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- dofwd - complete a forward reference
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void dofwd(struct parse *p, sopno pos, sop value);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
dofwd(struct parse *p, sopno pos, sop value)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
|
|
|
/* avoid making error situations worse */
|
|
|
|
if (p->error != 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
assert(value < 1<<OPSHIFT);
|
|
|
|
p->strip[pos] = OP(p->strip[pos]) | value;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- enlarge - enlarge the strip
|
2011-11-10 01:44:05 +00:00
|
|
|
== static int enlarge(struct parse *p, sopno size);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
2011-11-10 01:44:05 +00:00
|
|
|
static int
|
2007-06-11 03:05:54 +00:00
|
|
|
enlarge(struct parse *p, sopno size)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
sop *sp;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
if (p->ssize >= size)
|
2011-11-10 01:44:05 +00:00
|
|
|
return 1;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
sp = (sop *)realloc(p->strip, size*sizeof(sop));
|
|
|
|
if (sp == NULL) {
|
|
|
|
SETERROR(REG_ESPACE);
|
2011-11-10 01:44:05 +00:00
|
|
|
return 0;
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
|
|
|
p->strip = sp;
|
|
|
|
p->ssize = size;
|
2011-11-10 01:44:05 +00:00
|
|
|
return 1;
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- stripsnug - compact the strip
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void stripsnug(struct parse *p, struct re_guts *g);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
stripsnug(struct parse *p, struct re_guts *g)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
|
|
|
g->nstates = p->slen;
|
|
|
|
g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
|
|
|
|
if (g->strip == NULL) {
|
|
|
|
SETERROR(REG_ESPACE);
|
|
|
|
g->strip = p->strip;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- findmust - fill in must and mlen with longest mandatory literal string
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void findmust(struct parse *p, struct re_guts *g);
|
1994-05-27 05:00:24 +00:00
|
|
|
*
|
|
|
|
* This algorithm could do fancy things like analyzing the operands of |
|
|
|
|
* for common subsequences. Someday. This code is simple and finds most
|
|
|
|
* of the interesting cases.
|
|
|
|
*
|
|
|
|
* Note that must and mlen got initialized during setup.
|
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
findmust(struct parse *p, struct re_guts *g)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
sop *scan;
|
1994-05-27 05:00:24 +00:00
|
|
|
sop *start;
|
2002-03-21 18:49:23 +00:00
|
|
|
sop *newstart;
|
|
|
|
sopno newlen;
|
|
|
|
sop s;
|
|
|
|
char *cp;
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthesized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
int offset;
|
2004-07-12 07:35:59 +00:00
|
|
|
char buf[MB_LEN_MAX];
|
|
|
|
size_t clen;
|
|
|
|
mbstate_t mbs;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
/* avoid making error situations worse */
|
|
|
|
if (p->error != 0)
|
|
|
|
return;
|
|
|
|
|
2004-07-12 07:35:59 +00:00
|
|
|
/*
|
|
|
|
* It's not generally safe to do a ``char'' substring search on
|
|
|
|
* multibyte character strings, but it's safe for at least
|
|
|
|
* UTF-8 (see RFC 3629).
|
|
|
|
*/
|
|
|
|
if (MB_CUR_MAX > 1 &&
|
|
|
|
strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
|
|
|
|
return;
|
|
|
|
|
1994-05-27 05:00:24 +00:00
|
|
|
/* find the longest OCHAR sequence in strip */
|
|
|
|
newlen = 0;
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthesized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
offset = 0;
|
|
|
|
g->moffset = 0;
|
1994-05-27 05:00:24 +00:00
|
|
|
scan = g->strip + 1;
|
|
|
|
do {
|
|
|
|
s = *scan++;
|
|
|
|
switch (OP(s)) {
|
|
|
|
case OCHAR: /* sequence member */
|
2004-07-12 07:35:59 +00:00
|
|
|
if (newlen == 0) { /* new sequence */
|
|
|
|
memset(&mbs, 0, sizeof(mbs));
|
1994-05-27 05:00:24 +00:00
|
|
|
newstart = scan - 1;
|
2004-07-12 07:35:59 +00:00
|
|
|
}
|
|
|
|
clen = wcrtomb(buf, OPND(s), &mbs);
|
|
|
|
if (clen == (size_t)-1)
|
|
|
|
goto toohard;
|
|
|
|
newlen += clen;
|
1994-05-27 05:00:24 +00:00
|
|
|
break;
|
|
|
|
case OPLUS_: /* things that don't break one */
|
|
|
|
case OLPAREN:
|
|
|
|
case ORPAREN:
|
|
|
|
break;
|
|
|
|
case OQUEST_: /* things that must be skipped */
|
|
|
|
case OCH_:
|
2004-07-11 05:58:31 +00:00
|
|
|
offset = altoffset(scan, offset);
|
1994-05-27 05:00:24 +00:00
|
|
|
scan--;
|
|
|
|
do {
|
|
|
|
scan += OPND(s);
|
|
|
|
s = *scan;
|
|
|
|
/* assert() interferes w debug printouts */
|
|
|
|
if (OP(s) != O_QUEST && OP(s) != O_CH &&
|
|
|
|
OP(s) != OOR2) {
|
|
|
|
g->iflags |= BAD;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
} while (OP(s) != O_QUEST && OP(s) != O_CH);
|
2002-08-25 13:10:45 +00:00
|
|
|
/* FALLTHROUGH */
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthesized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
case OBOW: /* things that break a sequence */
|
|
|
|
case OEOW:
|
|
|
|
case OBOL:
|
|
|
|
case OEOL:
|
|
|
|
case O_QUEST:
|
|
|
|
case O_CH:
|
|
|
|
case OEND:
|
1994-05-27 05:00:24 +00:00
|
|
|
if (newlen > g->mlen) { /* ends one */
|
|
|
|
start = newstart;
|
|
|
|
g->mlen = newlen;
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthesized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
if (offset > -1) {
|
|
|
|
g->moffset += offset;
|
|
|
|
offset = newlen;
|
|
|
|
} else
|
|
|
|
g->moffset = offset;
|
|
|
|
} else {
|
|
|
|
if (offset > -1)
|
|
|
|
offset += newlen;
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
|
|
|
newlen = 0;
|
|
|
|
break;
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthesized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
case OANY:
|
|
|
|
if (newlen > g->mlen) { /* ends one */
|
|
|
|
start = newstart;
|
|
|
|
g->mlen = newlen;
|
|
|
|
if (offset > -1) {
|
|
|
|
g->moffset += offset;
|
|
|
|
offset = newlen;
|
|
|
|
} else
|
|
|
|
g->moffset = offset;
|
|
|
|
} else {
|
|
|
|
if (offset > -1)
|
|
|
|
offset += newlen;
|
|
|
|
}
|
|
|
|
if (offset > -1)
|
|
|
|
offset++;
|
|
|
|
newlen = 0;
|
|
|
|
break;
|
|
|
|
case OANYOF: /* may or may not invalidate offset */
|
|
|
|
/* First, everything as OANY */
|
|
|
|
if (newlen > g->mlen) { /* ends one */
|
|
|
|
start = newstart;
|
|
|
|
g->mlen = newlen;
|
|
|
|
if (offset > -1) {
|
|
|
|
g->moffset += offset;
|
|
|
|
offset = newlen;
|
|
|
|
} else
|
|
|
|
g->moffset = offset;
|
|
|
|
} else {
|
|
|
|
if (offset > -1)
|
|
|
|
offset += newlen;
|
|
|
|
}
|
|
|
|
if (offset > -1)
|
|
|
|
offset++;
|
|
|
|
newlen = 0;
|
|
|
|
break;
|
2004-07-12 07:35:59 +00:00
|
|
|
toohard:
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthesized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
default:
|
|
|
|
/* Anything here makes it impossible or too hard
|
|
|
|
* to calculate the offset -- so we give up;
|
|
|
|
* save the last known good offset, in case the
|
|
|
|
* must sequence doesn't occur later.
|
|
|
|
*/
|
|
|
|
if (newlen > g->mlen) { /* ends one */
|
|
|
|
start = newstart;
|
|
|
|
g->mlen = newlen;
|
|
|
|
if (offset > -1)
|
|
|
|
g->moffset += offset;
|
|
|
|
else
|
|
|
|
g->moffset = offset;
|
|
|
|
}
|
|
|
|
offset = -1;
|
|
|
|
newlen = 0;
|
|
|
|
break;
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
|
|
|
} while (OP(s) != OEND);
|
|
|
|
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthesized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
if (g->mlen == 0) { /* there isn't one */
|
|
|
|
g->moffset = -1;
|
1994-05-27 05:00:24 +00:00
|
|
|
return;
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthesized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
}
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
/* turn it into a character string */
|
|
|
|
g->must = malloc((size_t)g->mlen + 1);
|
|
|
|
if (g->must == NULL) { /* argh; just forget it */
|
|
|
|
g->mlen = 0;
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthesized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
g->moffset = -1;
|
1994-05-27 05:00:24 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
cp = g->must;
|
|
|
|
scan = start;
|
2004-07-12 07:35:59 +00:00
|
|
|
memset(&mbs, 0, sizeof(mbs));
|
|
|
|
while (cp < g->must + g->mlen) {
|
1994-05-27 05:00:24 +00:00
|
|
|
while (OP(s = *scan++) != OCHAR)
|
|
|
|
continue;
|
2004-07-12 07:35:59 +00:00
|
|
|
clen = wcrtomb(cp, OPND(s), &mbs);
|
|
|
|
assert(clen != (size_t)-1);
|
|
|
|
cp += clen;
|
1994-05-27 05:00:24 +00:00
|
|
|
}
|
|
|
|
assert(cp == g->must + g->mlen);
|
|
|
|
*cp++ = '\0'; /* just on general principles */
|
|
|
|
}
|
|
|
|
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthesized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
/*
|
|
|
|
- altoffset - choose biggest offset among multiple choices
|
2004-07-11 05:58:31 +00:00
|
|
|
== static int altoffset(sop *scan, int offset);
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthesized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
*
|
|
|
|
* Compute, recursively if necessary, the largest offset among multiple
|
|
|
|
* re paths.
|
|
|
|
*/
|
|
|
|
static int
|
2007-06-11 03:05:54 +00:00
|
|
|
altoffset(sop *scan, int offset)
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthesized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
{
|
|
|
|
int largest;
|
|
|
|
int try;
|
|
|
|
sop s;
|
|
|
|
|
|
|
|
/* If we gave up already on offsets, return */
|
|
|
|
if (offset == -1)
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
largest = 0;
|
|
|
|
try = 0;
|
|
|
|
s = *scan++;
|
|
|
|
while (OP(s) != O_QUEST && OP(s) != O_CH) {
|
|
|
|
switch (OP(s)) {
|
|
|
|
case OOR1:
|
|
|
|
if (try > largest)
|
|
|
|
largest = try;
|
|
|
|
try = 0;
|
|
|
|
break;
|
|
|
|
case OQUEST_:
|
|
|
|
case OCH_:
|
2004-07-11 05:58:31 +00:00
|
|
|
try = altoffset(scan, try);
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthesized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
if (try == -1)
|
|
|
|
return -1;
|
|
|
|
scan--;
|
|
|
|
do {
|
|
|
|
scan += OPND(s);
|
|
|
|
s = *scan;
|
|
|
|
if (OP(s) != O_QUEST && OP(s) != O_CH &&
|
|
|
|
OP(s) != OOR2)
|
|
|
|
return -1;
|
|
|
|
} while (OP(s) != O_QUEST && OP(s) != O_CH);
|
2000-07-09 17:45:30 +00:00
|
|
|
/* We must skip to the next position, or we'll
|
|
|
|
* leave altoffset() too early.
|
|
|
|
*/
|
|
|
|
scan++;
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthesized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
break;
|
|
|
|
case OANYOF:
|
|
|
|
case OCHAR:
|
|
|
|
case OANY:
|
|
|
|
try++;
|
|
|
|
case OBOW:
|
|
|
|
case OEOW:
|
|
|
|
case OLPAREN:
|
|
|
|
case ORPAREN:
|
|
|
|
case OOR2:
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
try = -1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (try == -1)
|
|
|
|
return -1;
|
|
|
|
s = *scan++;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (try > largest)
|
|
|
|
largest = try;
|
|
|
|
|
|
|
|
return largest+offset;
|
|
|
|
}
|
|
|
|
|
Add Boyer-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enables us to test only a fraction of the text's characters,
but has a worse performance than the straight-forward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present on the text for a successful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweigh any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fails, the other one is freed.
In this case, the straight-forward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
/*
|
|
|
|
- computejumps - compute char jumps for BM scan
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void computejumps(struct parse *p, struct re_guts *g);
|
Add Boyer-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enables us to test only a fraction of the text's characters,
but has a worse performance than the straight-forward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present on the text for a successful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweigh any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fails, the other one is freed.
In this case, the straight-forward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
*
|
|
|
|
* This algorithm assumes g->must exists and has size greater than
|
|
|
|
* zero. It's based on the algorithm found in Computer Algorithms by
|
|
|
|
* Sara Baase.
|
|
|
|
*
|
|
|
|
* A char jump is the number of characters one needs to jump based on
|
|
|
|
* the value of the character from the text that was mismatched.
|
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
computejumps(struct parse *p, struct re_guts *g)
|
Add Boyer-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enables us to test only a fraction of the text's characters,
but has a worse performance than the straight-forward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present on the text for a successful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweigh any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fails, the other one is freed.
In this case, the straight-forward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
{
|
|
|
|
int ch;
|
|
|
|
int mindex;
|
|
|
|
|
|
|
|
/* Avoid making errors worse */
|
|
|
|
if (p->error != 0)
|
|
|
|
return;
|
|
|
|
|
2000-07-09 15:12:28 +00:00
|
|
|
g->charjump = (int*) malloc((NC + 1) * sizeof(int));
|
Add Boyer-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enables us to test only a fraction of the text's characters,
but has a worse performance than the straight-forward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present on the text for a successful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweigh any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fails, the other one is freed.
In this case, the straight-forward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
if (g->charjump == NULL) /* Not a fatal error */
|
|
|
|
return;
|
2000-07-07 07:46:36 +00:00
|
|
|
/* Adjust for signed chars, if necessary */
|
|
|
|
g->charjump = &g->charjump[-(CHAR_MIN)];
|
Add Boyer-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enables us to test only a fraction of the text's characters,
but has a worse performance than the straight-forward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present on the text for a successful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweigh any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fails, the other one is freed.
In this case, the straight-forward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
|
|
|
|
/* If the character does not exist in the pattern, the jump
|
|
|
|
* is equal to the number of characters in the pattern.
|
|
|
|
*/
|
2000-07-07 07:46:36 +00:00
|
|
|
for (ch = CHAR_MIN; ch < (CHAR_MAX + 1); ch++)
|
Add Boyer-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enables us to test only a fraction of the text's characters,
but has a worse performance than the straight-forward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present on the text for a successful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweigh any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fails, the other one is freed.
In this case, the straight-forward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
g->charjump[ch] = g->mlen;
|
|
|
|
|
|
|
|
/* If the character does exist, compute the jump that would
|
|
|
|
* take us to the last character in the pattern equal to it
|
|
|
|
* (notice that we match right to left, so that last character
|
|
|
|
* is the first one that would be matched).
|
|
|
|
*/
|
|
|
|
for (mindex = 0; mindex < g->mlen; mindex++)
|
2003-02-16 17:29:11 +00:00
|
|
|
g->charjump[(int)g->must[mindex]] = g->mlen - mindex - 1;
|
Add Boyer-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enables us to test only a fraction of the text's characters,
but has a worse performance than the straight-forward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present on the text for a successful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweigh any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fails, the other one is freed.
In this case, the straight-forward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
- computematchjumps - compute match jumps for BM scan
|
2002-03-21 18:49:23 +00:00
|
|
|
== static void computematchjumps(struct parse *p, struct re_guts *g);
|
Add Boyer-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enables us to test only a fraction of the text's characters,
but has a worse performance than the straight-forward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present on the text for a successful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweigh any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fails, the other one is freed.
In this case, the straight-forward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
*
|
|
|
|
* This algorithm assumes g->must exists and has size greater than
|
|
|
|
* zero. It's based on the algorithm found in Computer Algorithms by
|
|
|
|
* Sara Baase.
|
|
|
|
*
|
|
|
|
* A match jump is the number of characters one needs to advance based
|
|
|
|
* on the already-matched suffix.
|
|
|
|
* Notice that all values here are minus (g->mlen-1), because of the way
|
|
|
|
* the search algorithm works.
|
|
|
|
*/
|
|
|
|
static void
|
2007-06-11 03:05:54 +00:00
|
|
|
computematchjumps(struct parse *p, struct re_guts *g)
|
Add Boyer-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enables us to test only a fraction of the text's characters,
but has a worse performance than the straight-forward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present on the text for a successful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweigh any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fails, the other one is freed.
In this case, the straight-forward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
{
|
|
|
|
int mindex; /* General "must" iterator */
|
|
|
|
int suffix; /* Keeps track of matching suffix */
|
|
|
|
int ssuffix; /* Keeps track of suffixes' suffix */
|
|
|
|
int* pmatches; /* pmatches[k] points to the next i
|
|
|
|
* such that i+1...mlen is a substring
|
|
|
|
* of k+1...k+mlen-i-1
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Avoid making errors worse */
|
|
|
|
if (p->error != 0)
|
|
|
|
return;
|
|
|
|
|
2000-07-09 15:12:28 +00:00
|
|
|
pmatches = (int*) malloc(g->mlen * sizeof(unsigned int));
|
Add Boyer-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enables us to test only a fraction of the text's characters,
but has a worse performance than the straight-forward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present on the text for a successful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweigh any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fails, the other one is freed.
In this case, the straight-forward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
if (pmatches == NULL) {
|
|
|
|
g->matchjump = NULL;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2000-07-09 15:12:28 +00:00
|
|
|
g->matchjump = (int*) malloc(g->mlen * sizeof(unsigned int));
|
Add Boyer-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enables us to test only a fraction of the text's characters,
but has a worse performance than the straight-forward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present on the text for a successful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweigh any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fails, the other one is freed.
In this case, the straight-forward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
if (g->matchjump == NULL) /* Not a fatal error */
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* Set maximum possible jump for each character in the pattern */
|
|
|
|
for (mindex = 0; mindex < g->mlen; mindex++)
|
|
|
|
g->matchjump[mindex] = 2*g->mlen - mindex - 1;
|
|
|
|
|
|
|
|
/* Compute pmatches[] */
|
|
|
|
for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
|
|
|
|
mindex--, suffix--) {
|
|
|
|
pmatches[mindex] = suffix;
|
|
|
|
|
|
|
|
/* If a mismatch is found, interrupting the substring,
|
|
|
|
* compute the matchjump for that position. If no
|
|
|
|
* mismatch is found, then a text substring mismatched
|
|
|
|
* against the suffix will also mismatch against the
|
|
|
|
* substring.
|
|
|
|
*/
|
|
|
|
while (suffix < g->mlen
|
|
|
|
&& g->must[mindex] != g->must[suffix]) {
|
|
|
|
g->matchjump[suffix] = MIN(g->matchjump[suffix],
|
|
|
|
g->mlen - mindex - 1);
|
|
|
|
suffix = pmatches[suffix];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Compute the matchjump up to the last substring found to jump
|
|
|
|
* to the beginning of the largest must pattern prefix matching
|
|
|
|
* its own suffix.
|
|
|
|
*/
|
|
|
|
for (mindex = 0; mindex <= suffix; mindex++)
|
|
|
|
g->matchjump[mindex] = MIN(g->matchjump[mindex],
|
|
|
|
g->mlen + suffix - mindex);
|
|
|
|
|
|
|
|
ssuffix = pmatches[suffix];
|
Enhance the optimization provided by pre-matching. Fix style bugs with
previous commits.
At the time we search the pattern for the "must" string, we now compute
the longest offset from the beginning of the pattern at which the must
string might be found. If that offset is found to be infinite (through
use of "+" or "*"), we set it to -1 to disable the heuristics applied
later.
After we are done with pre-matching, we use that offset and the point in
the text at which the must string was found to compute the earliest
point at which the pattern might be found.
Special care should be taken here. The variable "start" is passed to the
automata-processing functions fast() and slow() to indicate the point in
the text at which they should start working from. The real beginning of
the text is passed in a struct match variable m, which is used to check
for anchors. That variable, though, is initialized with "start", so we
must not adjust "start" before "m" is properly initialized.
Simple tests showed a speed increase from 100% to 400%, but they were
biased in that regexec() was called for the whole file instead of line
by line, and parenthesized subexpressions were not searched for.
This change adds a single integer to the size of the "guts" structure,
and does not change the ABI.
Further improvements possible:
Since the speed increase observed here is so huge, one intuitive
optimization would be to introduce a bias in the function that computes
the "must" string so as to prefer a smaller string with a finite offset
over a larger one with an infinite offset. Tests have shown this to be a
bad idea, though, as the cost of false pre-matches far outweighs the
benefits of a must offset, even in biased situations.
A number of other improvements suggest themselves, though:
* identify the cases where the pattern is identical to the must
string, and avoid entering fast() and slow() in these cases.
* compute the maximum offset from the must string to the end of
the pattern, and use that to set the point at which fast() and
slow() should give up trying to find a match, and then
return to pre-matching.
* return all the way to pre-matching if a "match" was found and
later invalidated by back reference processing. Since back
references are evil and should be avoided anyway, this is of
little use.
2000-07-02 10:58:07 +00:00
|
|
|
while (suffix < g->mlen) {
|
2000-07-06 06:34:15 +00:00
|
|
|
while (suffix <= ssuffix && suffix < g->mlen) {
|
Add Boyer-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enables us to test only a fraction of the text's characters,
but has a worse performance than the straight-forward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present on the text for a successful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweigh any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fails, the other one is freed.
In this case, the straight-forward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
g->matchjump[suffix] = MIN(g->matchjump[suffix],
|
|
|
|
g->mlen + ssuffix - suffix);
|
|
|
|
suffix++;
|
|
|
|
}
|
2001-11-09 10:17:44 +00:00
|
|
|
if (suffix < g->mlen)
|
|
|
|
ssuffix = pmatches[ssuffix];
|
Add Boyer-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left,
and jumping as many characters as viable based on the text's mismatched
character and the pattern's already matched suffix.
This typically enables us to test only a fraction of the text's characters,
but has a worse performance than the straight-forward method for small
patterns. Because of this, the BM algorithm will only be used if the
pattern size is at least 4 characters.
Notice that this pre-matching is done on the largest substring of the
regular expression that _must_ be present on the text for a successful
match to be possible at all.
For instance, "(xyzzy|grues)" will yield a null "must" substring, and,
therefore, not benefit from the BM algorithm at all. Because of the
lack of intelligence of the algorithm that finds the "must" string,
things like "charjump|matchjump" will also yield a null string. To
optimize that, "(char|match)jump" should be used.
The setup time (at regcomp()) for the BM algorithm will most likely
outweigh any benefits for one-time matches. Given the slow regex(3)
we have, this is unlikely to be even perceptible, though.
The size of a regex_t structure is increased by 2*sizeof(char*) +
256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the
regex_t's "guts", which is allocated dynamically by regcomp(). If
allocation of either of the two tables fails, the other one is freed.
In this case, the straight-forward algorithm is used for pre-matching.
Tests exercising the code path affected have shown a speed increase of
50% for "must" strings of length four or five.
API and ABI remain unchanged by this commit.
The patch submitted on the PR was not used, as it was non-functional.
PR: 14342
2000-06-29 04:48:34 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
free(pmatches);
|
|
|
|
}
|
|
|
|
|
1994-05-27 05:00:24 +00:00
|
|
|
/*
|
|
|
|
- pluscount - count + nesting
|
2002-03-21 18:49:23 +00:00
|
|
|
== static sopno pluscount(struct parse *p, struct re_guts *g);
|
1994-05-27 05:00:24 +00:00
|
|
|
*/
|
|
|
|
static sopno /* nesting depth */
|
2007-06-11 03:05:54 +00:00
|
|
|
pluscount(struct parse *p, struct re_guts *g)
|
1994-05-27 05:00:24 +00:00
|
|
|
{
|
2002-03-21 18:49:23 +00:00
|
|
|
sop *scan;
|
|
|
|
sop s;
|
|
|
|
sopno plusnest = 0;
|
|
|
|
sopno maxnest = 0;
|
1994-05-27 05:00:24 +00:00
|
|
|
|
|
|
|
if (p->error != 0)
|
|
|
|
return(0); /* there may not be an OEND */
|
|
|
|
|
|
|
|
scan = g->strip + 1;
|
|
|
|
do {
|
|
|
|
s = *scan++;
|
|
|
|
switch (OP(s)) {
|
|
|
|
case OPLUS_:
|
|
|
|
plusnest++;
|
|
|
|
break;
|
|
|
|
case O_PLUS:
|
|
|
|
if (plusnest > maxnest)
|
|
|
|
maxnest = plusnest;
|
|
|
|
plusnest--;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} while (OP(s) != OEND);
|
|
|
|
if (plusnest != 0)
|
|
|
|
g->iflags |= BAD;
|
|
|
|
return(maxnest);
|
|
|
|
}
|