Remove incomplete support for multi-character collating elements. Remove
unused character category calculations.
This commit is contained in:
parent
5fd437f0d8
commit
db66ea27a0
@ -106,17 +106,6 @@ static void freeset(struct parse *p, cset *cs);
|
||||
static int freezeset(struct parse *p, cset *cs);
|
||||
static int firstch(struct parse *p, cset *cs);
|
||||
static int nch(struct parse *p, cset *cs);
|
||||
static void mcadd(struct parse *p, cset *cs, char *cp) __unused;
|
||||
#if used
|
||||
static void mcsub(cset *cs, char *cp);
|
||||
static int mcin(cset *cs, char *cp);
|
||||
static char *mcfind(cset *cs, char *cp);
|
||||
#endif
|
||||
static void mcinvert(struct parse *p, cset *cs);
|
||||
static void mccase(struct parse *p, cset *cs);
|
||||
static int isinsets(struct re_guts *g, int c);
|
||||
static int samesets(struct re_guts *g, int c1, int c2);
|
||||
static void categorize(struct parse *p, struct re_guts *g);
|
||||
static sopno dupl(struct parse *p, sopno start, sopno finish);
|
||||
static void doemit(struct parse *p, sop op, size_t opnd);
|
||||
static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
|
||||
@ -124,7 +113,7 @@ static void dofwd(struct parse *p, sopno pos, sop value);
|
||||
static void enlarge(struct parse *p, sopno size);
|
||||
static void stripsnug(struct parse *p, struct re_guts *g);
|
||||
static void findmust(struct parse *p, struct re_guts *g);
|
||||
static int altoffset(sop *scan, int offset, int mccs);
|
||||
static int altoffset(sop *scan, int offset);
|
||||
static void computejumps(struct parse *p, struct re_guts *g);
|
||||
static void computematchjumps(struct parse *p, struct re_guts *g);
|
||||
static sopno pluscount(struct parse *p, struct re_guts *g);
|
||||
@ -216,8 +205,7 @@ int cflags;
|
||||
len = strlen((char *)pattern);
|
||||
|
||||
/* do the mallocs early so failure handling is easy */
|
||||
g = (struct re_guts *)malloc(sizeof(struct re_guts) +
|
||||
(NC-1)*sizeof(cat_t));
|
||||
g = (struct re_guts *)malloc(sizeof(struct re_guts));
|
||||
if (g == NULL)
|
||||
return(REG_ESPACE);
|
||||
p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
|
||||
@ -252,9 +240,6 @@ int cflags;
|
||||
g->matchjump = NULL;
|
||||
g->mlen = 0;
|
||||
g->nsub = 0;
|
||||
g->ncategories = 1; /* category 0 is "everything else" */
|
||||
g->categories = &g->catspace[-(CHAR_MIN)];
|
||||
(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
|
||||
g->backrefs = 0;
|
||||
|
||||
/* do it */
|
||||
@ -270,7 +255,6 @@ int cflags;
|
||||
g->laststate = THERE();
|
||||
|
||||
/* tidy up loose ends and fill things in */
|
||||
categorize(p, g);
|
||||
stripsnug(p, g);
|
||||
findmust(p, g);
|
||||
/* only use Boyer-Moore algorithm if the pattern is bigger
|
||||
@ -516,9 +500,7 @@ struct parse *p;
|
||||
* Giving end1 as OUT essentially eliminates the end1/end2 check.
|
||||
*
|
||||
* This implementation is a bit of a kludge, in that a trailing $ is first
|
||||
* taken as an ordinary character and then revised to be an anchor. The
|
||||
* only undesirable side effect is that '$' gets included as a character
|
||||
* category in such cases. This is fairly harmless; not worth fixing.
|
||||
* taken as an ordinary character and then revised to be an anchor.
|
||||
* The amount of lookahead needed to avoid this kludge is excessive.
|
||||
*/
|
||||
static void
|
||||
@ -739,8 +721,6 @@ struct parse *p;
|
||||
if (ci != i)
|
||||
CHadd(cs, ci);
|
||||
}
|
||||
if (cs->multis != NULL)
|
||||
mccase(p, cs);
|
||||
}
|
||||
if (invert) {
|
||||
int i;
|
||||
@ -752,12 +732,8 @@ struct parse *p;
|
||||
CHadd(cs, i);
|
||||
if (p->g->cflags&REG_NEWLINE)
|
||||
CHsub(cs, '\n');
|
||||
if (cs->multis != NULL)
|
||||
mcinvert(p, cs);
|
||||
}
|
||||
|
||||
assert(cs->multis == NULL); /* xxx */
|
||||
|
||||
if (nch(p, cs) == 1) { /* optimize singleton sets */
|
||||
ordinary(p, firstch(p, cs));
|
||||
freeset(p, cs);
|
||||
@ -812,7 +788,6 @@ cset *cs;
|
||||
(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
|
||||
break;
|
||||
default: /* symbol, ordinary character, or range */
|
||||
/* xxx revision needed for multichar stuff */
|
||||
start = p_b_symbol(p);
|
||||
if (SEE('-') && MORE2() && PEEK2() != ']') {
|
||||
/* range */
|
||||
@ -932,10 +907,6 @@ cset *cs;
|
||||
CHadd(cs, c);
|
||||
break;
|
||||
}
|
||||
#if 0
|
||||
for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
|
||||
MCadd(p, cs, u);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1059,15 +1030,11 @@ ordinary(p, ch)
|
||||
struct parse *p;
|
||||
int ch;
|
||||
{
|
||||
cat_t *cap = p->g->categories;
|
||||
|
||||
if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
|
||||
bothcases(p, ch);
|
||||
else {
|
||||
else
|
||||
EMIT(OCHAR, (uch)ch);
|
||||
if (cap[ch] == 0)
|
||||
cap[ch] = p->g->ncategories++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1233,8 +1200,6 @@ struct parse *p;
|
||||
cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
|
||||
cs->mask = 1 << ((no) % CHAR_BIT);
|
||||
cs->hash = 0;
|
||||
cs->smultis = 0;
|
||||
cs->multis = NULL;
|
||||
|
||||
return(cs);
|
||||
}
|
||||
@ -1336,193 +1301,6 @@ cset *cs;
|
||||
return(n);
|
||||
}
|
||||
|
||||
/*
|
||||
- mcadd - add a collating element to a cset
|
||||
== static void mcadd(struct parse *p, cset *cs, \
|
||||
== char *cp);
|
||||
*/
|
||||
static void
|
||||
mcadd(p, cs, cp)
|
||||
struct parse *p;
|
||||
cset *cs;
|
||||
char *cp;
|
||||
{
|
||||
size_t oldend = cs->smultis;
|
||||
|
||||
cs->smultis += strlen(cp) + 1;
|
||||
if (cs->multis == NULL)
|
||||
cs->multis = malloc(cs->smultis);
|
||||
else
|
||||
cs->multis = reallocf(cs->multis, cs->smultis);
|
||||
if (cs->multis == NULL) {
|
||||
SETERROR(REG_ESPACE);
|
||||
return;
|
||||
}
|
||||
|
||||
(void) strcpy(cs->multis + oldend - 1, cp);
|
||||
cs->multis[cs->smultis - 1] = '\0';
|
||||
}
|
||||
|
||||
#if used
|
||||
/*
|
||||
- mcsub - subtract a collating element from a cset
|
||||
== static void mcsub(cset *cs, char *cp);
|
||||
*/
|
||||
static void
|
||||
mcsub(cs, cp)
|
||||
cset *cs;
|
||||
char *cp;
|
||||
{
|
||||
char *fp = mcfind(cs, cp);
|
||||
size_t len = strlen(fp);
|
||||
|
||||
assert(fp != NULL);
|
||||
(void) memmove(fp, fp + len + 1,
|
||||
cs->smultis - (fp + len + 1 - cs->multis));
|
||||
cs->smultis -= len;
|
||||
|
||||
if (cs->smultis == 0) {
|
||||
free(cs->multis);
|
||||
cs->multis = NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
cs->multis = reallocf(cs->multis, cs->smultis);
|
||||
assert(cs->multis != NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
- mcin - is a collating element in a cset?
|
||||
== static int mcin(cset *cs, char *cp);
|
||||
*/
|
||||
static int
|
||||
mcin(cs, cp)
|
||||
cset *cs;
|
||||
char *cp;
|
||||
{
|
||||
return(mcfind(cs, cp) != NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
- mcfind - find a collating element in a cset
|
||||
== static char *mcfind(cset *cs, char *cp);
|
||||
*/
|
||||
static char *
|
||||
mcfind(cs, cp)
|
||||
cset *cs;
|
||||
char *cp;
|
||||
{
|
||||
char *p;
|
||||
|
||||
if (cs->multis == NULL)
|
||||
return(NULL);
|
||||
for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
|
||||
if (strcmp(cp, p) == 0)
|
||||
return(p);
|
||||
return(NULL);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
- mcinvert - invert the list of collating elements in a cset
|
||||
== static void mcinvert(struct parse *p, cset *cs);
|
||||
*
|
||||
* This would have to know the set of possibilities. Implementation
|
||||
* is deferred.
|
||||
*/
|
||||
static void
|
||||
mcinvert(p, cs)
|
||||
struct parse *p;
|
||||
cset *cs;
|
||||
{
|
||||
assert(cs->multis == NULL); /* xxx */
|
||||
}
|
||||
|
||||
/*
|
||||
- mccase - add case counterparts of the list of collating elements in a cset
|
||||
== static void mccase(struct parse *p, cset *cs);
|
||||
*
|
||||
* This would have to know the set of possibilities. Implementation
|
||||
* is deferred.
|
||||
*/
|
||||
static void
|
||||
mccase(p, cs)
|
||||
struct parse *p;
|
||||
cset *cs;
|
||||
{
|
||||
assert(cs->multis == NULL); /* xxx */
|
||||
}
|
||||
|
||||
/*
|
||||
- isinsets - is this character in any sets?
|
||||
== static int isinsets(struct re_guts *g, int c);
|
||||
*/
|
||||
static int /* predicate */
|
||||
isinsets(g, c)
|
||||
struct re_guts *g;
|
||||
int c;
|
||||
{
|
||||
uch *col;
|
||||
int i;
|
||||
int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
|
||||
unsigned uc = (uch)c;
|
||||
|
||||
for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
|
||||
if (col[uc] != 0)
|
||||
return(1);
|
||||
return(0);
|
||||
}
|
||||
|
||||
/*
|
||||
- samesets - are these two characters in exactly the same sets?
|
||||
== static int samesets(struct re_guts *g, int c1, int c2);
|
||||
*/
|
||||
static int /* predicate */
|
||||
samesets(g, c1, c2)
|
||||
struct re_guts *g;
|
||||
int c1;
|
||||
int c2;
|
||||
{
|
||||
uch *col;
|
||||
int i;
|
||||
int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
|
||||
unsigned uc1 = (uch)c1;
|
||||
unsigned uc2 = (uch)c2;
|
||||
|
||||
for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
|
||||
if (col[uc1] != col[uc2])
|
||||
return(0);
|
||||
return(1);
|
||||
}
|
||||
|
||||
/*
|
||||
- categorize - sort out character categories
|
||||
== static void categorize(struct parse *p, struct re_guts *g);
|
||||
*/
|
||||
static void
|
||||
categorize(p, g)
|
||||
struct parse *p;
|
||||
struct re_guts *g;
|
||||
{
|
||||
cat_t *cats = g->categories;
|
||||
int c;
|
||||
int c2;
|
||||
cat_t cat;
|
||||
|
||||
/* avoid making error situations worse */
|
||||
if (p->error != 0)
|
||||
return;
|
||||
|
||||
for (c = CHAR_MIN; c <= CHAR_MAX; c++)
|
||||
if (cats[c] == 0 && isinsets(g, c)) {
|
||||
cat = g->ncategories++;
|
||||
cats[c] = cat;
|
||||
for (c2 = c+1; c2 <= CHAR_MAX; c2++)
|
||||
if (cats[c2] == 0 && samesets(g, c, c2))
|
||||
cats[c2] = cat;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
- dupl - emit a duplicate of a bunch of sops
|
||||
== static sopno dupl(struct parse *p, sopno start, sopno finish);
|
||||
@ -1698,18 +1476,11 @@ struct re_guts *g;
|
||||
char *cp;
|
||||
sopno i;
|
||||
int offset;
|
||||
int cs, mccs;
|
||||
|
||||
/* avoid making error situations worse */
|
||||
if (p->error != 0)
|
||||
return;
|
||||
|
||||
/* Find out if we can handle OANYOF or not */
|
||||
mccs = 0;
|
||||
for (cs = 0; cs < g->ncsets; cs++)
|
||||
if (g->sets[cs].multis != NULL)
|
||||
mccs = 1;
|
||||
|
||||
/* find the longest OCHAR sequence in strip */
|
||||
newlen = 0;
|
||||
offset = 0;
|
||||
@ -1729,7 +1500,7 @@ struct re_guts *g;
|
||||
break;
|
||||
case OQUEST_: /* things that must be skipped */
|
||||
case OCH_:
|
||||
offset = altoffset(scan, offset, mccs);
|
||||
offset = altoffset(scan, offset);
|
||||
scan--;
|
||||
do {
|
||||
scan += OPND(s);
|
||||
@ -1797,11 +1568,6 @@ struct re_guts *g;
|
||||
if (offset > -1)
|
||||
offset++;
|
||||
newlen = 0;
|
||||
/* And, now, if we found out we can't deal with
|
||||
* it, make offset = -1.
|
||||
*/
|
||||
if (mccs)
|
||||
offset = -1;
|
||||
break;
|
||||
default:
|
||||
/* Anything here makes it impossible or too hard
|
||||
@ -1849,16 +1615,15 @@ struct re_guts *g;
|
||||
|
||||
/*
|
||||
- altoffset - choose biggest offset among multiple choices
|
||||
== static int altoffset(sop *scan, int offset, int mccs);
|
||||
== static int altoffset(sop *scan, int offset);
|
||||
*
|
||||
* Compute, recursively if necessary, the largest offset among multiple
|
||||
* re paths.
|
||||
*/
|
||||
static int
|
||||
altoffset(scan, offset, mccs)
|
||||
altoffset(scan, offset)
|
||||
sop *scan;
|
||||
int offset;
|
||||
int mccs;
|
||||
{
|
||||
int largest;
|
||||
int try;
|
||||
@ -1880,7 +1645,7 @@ int mccs;
|
||||
break;
|
||||
case OQUEST_:
|
||||
case OCH_:
|
||||
try = altoffset(scan, try, mccs);
|
||||
try = altoffset(scan, try);
|
||||
if (try == -1)
|
||||
return -1;
|
||||
scan--;
|
||||
@ -1897,8 +1662,6 @@ int mccs;
|
||||
scan++;
|
||||
break;
|
||||
case OANYOF:
|
||||
if (mccs)
|
||||
return -1;
|
||||
case OCHAR:
|
||||
case OANY:
|
||||
try++;
|
||||
|
@ -113,29 +113,16 @@ typedef long sopno;
|
||||
* The individual set therefore has both a pointer to the byte vector
|
||||
* and a mask to pick out the relevant bit of each byte. A hash code
|
||||
* simplifies testing whether two sets could be identical.
|
||||
*
|
||||
* This will get trickier for multicharacter collating elements. As
|
||||
* preliminary hooks for dealing with such things, we also carry along
|
||||
* a string of multi-character elements, and decide the size of the
|
||||
* vectors at run time.
|
||||
*/
|
||||
typedef struct {
|
||||
uch *ptr; /* -> uch [csetsize] */
|
||||
uch mask; /* bit within array */
|
||||
short hash; /* hash code */
|
||||
size_t smultis;
|
||||
char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 */
|
||||
} cset;
|
||||
/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */
|
||||
#define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (uch)(c))
|
||||
#define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (uch)(c))
|
||||
#define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask)
|
||||
#define MCadd(p, cs, cp) mcadd(p, cs, cp) /* regcomp() internal fns */
|
||||
#define MCsub(p, cs, cp) mcsub(p, cs, cp)
|
||||
#define MCin(p, cs, cp) mcin(p, cs, cp)
|
||||
|
||||
/* stuff for character categories */
|
||||
typedef unsigned char cat_t;
|
||||
|
||||
/*
|
||||
* main compiled-expression structure
|
||||
@ -158,8 +145,6 @@ struct re_guts {
|
||||
# define BAD 04 /* something wrong */
|
||||
int nbol; /* number of ^ used */
|
||||
int neol; /* number of $ used */
|
||||
int ncategories; /* how many character categories */
|
||||
cat_t *categories; /* ->catspace[-CHAR_MIN] */
|
||||
char *must; /* match must contain this string */
|
||||
int moffset; /* latest point at which must may be located */
|
||||
int *charjump; /* Boyer-Moore char jump table */
|
||||
@ -168,8 +153,6 @@ struct re_guts {
|
||||
size_t nsub; /* copy of re_nsub */
|
||||
int backrefs; /* does it use back references? */
|
||||
sopno nplus; /* how deep does it nest +s? */
|
||||
/* catspace must be last */
|
||||
cat_t catspace[1]; /* actually [NC] */
|
||||
};
|
||||
|
||||
/* misc utilities */
|
||||
|
Loading…
x
Reference in New Issue
Block a user