libregex: implement \` and \' (begin-of-subj, end-of-subj)

These are GNU extensions, generally equivalent to ^ and $, except that they
anchor only to the subject as a whole: in a multi-line expression, \` will
not match at the beginning of any line after the first, and \' will not
match at the end of any line before the absolute last.
This commit is contained in:
Kyle Evans 2020-12-05 03:13:47 +00:00
parent 7518fb346f
commit ca53e5aedf
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=368357
4 changed files with 74 additions and 13 deletions

View File

@ -109,7 +109,7 @@ static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatc
static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int); static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int);
static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast); static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast);
static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags);
#define MAX_RECURSION 100 #define MAX_RECURSION 100
#define BOL (OUT-1) #define BOL (OUT-1)
#define EOL (BOL-1) #define EOL (BOL-1)
@ -119,6 +119,10 @@ static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_
#define EOW (BOL-5) #define EOW (BOL-5)
#define BADCHAR (BOL-6) #define BADCHAR (BOL-6)
#define NONCHAR(c) ((c) <= OUT) #define NONCHAR(c) ((c) <= OUT)
/*
 * sflags: subject-position bits handed to step() alongside the NONCHAR
 * flag character.  walk() clears them for every position and sets them
 * only at the edges of the subject, which lets step() distinguish the
 * OBOS/OEOS operators from the per-line OBOL/OEOL ones.
 */
/* Set when the previous character was OUT and REG_NOTBOL is not in effect. */
#define SBOS 0x0001
/* Set when the current character is OUT and REG_NOTEOL is not in effect. */
#define SEOS 0x0002
#ifdef REDEBUG #ifdef REDEBUG
static void print(struct match *m, const char *caption, states st, int ch, FILE *d); static void print(struct match *m, const char *caption, states st, int ch, FILE *d);
#endif #endif
@ -457,6 +461,8 @@ dissect(struct match *m,
case OEOL: case OEOL:
case OBOW: case OBOW:
case OEOW: case OEOW:
case OBOS:
case OEOS:
break; break;
case OANY: case OANY:
case OANYOF: case OANYOF:
@ -657,6 +663,18 @@ backref(struct match *m,
if (wc == BADCHAR || !CHIN(cs, wc)) if (wc == BADCHAR || !CHIN(cs, wc))
return(NULL); return(NULL);
break; break;
case OBOS:
if (sp == m->beginp && (m->eflags & REG_NOTBOL) == 0)
{ /* yes */ }
else
return(NULL);
break;
case OEOS:
if (sp == m->endp && (m->eflags & REG_NOTEOL) == 0)
{ /* yes */ }
else
return(NULL);
break;
case OBOL: case OBOL:
if ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) || if ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
(sp > m->offp && sp < m->endp && (sp > m->offp && sp < m->endp &&
@ -819,15 +837,16 @@ walk(struct match *m, const char *start, const char *stop, sopno startst,
wint_t c; wint_t c;
wint_t lastc; /* previous c */ wint_t lastc; /* previous c */
wint_t flagch; wint_t flagch;
int i; int i, sflags;
const char *matchp; /* last p at which a match ended */ const char *matchp; /* last p at which a match ended */
size_t clen; size_t clen;
sflags = 0;
AT("slow", start, stop, startst, stopst); AT("slow", start, stop, startst, stopst);
CLEAR(st); CLEAR(st);
SET1(st, startst); SET1(st, startst);
SP("sstart", st, *p); SP("sstart", st, *p);
st = step(m->g, startst, stopst, st, NOTHING, st); st = step(m->g, startst, stopst, st, NOTHING, st, sflags);
if (fast) if (fast)
ASSIGN(fresh, st); ASSIGN(fresh, st);
matchp = NULL; matchp = NULL;
@ -844,6 +863,7 @@ walk(struct match *m, const char *start, const char *stop, sopno startst,
for (;;) { for (;;) {
/* next character */ /* next character */
lastc = c; lastc = c;
sflags = 0;
if (p == m->endp) { if (p == m->endp) {
c = OUT; c = OUT;
clen = 0; clen = 0;
@ -866,9 +886,20 @@ walk(struct match *m, const char *start, const char *stop, sopno startst,
flagch = (flagch == BOL) ? BOLEOL : EOL; flagch = (flagch == BOL) ? BOLEOL : EOL;
i += m->g->neol; i += m->g->neol;
} }
if (lastc == OUT && (m->eflags & REG_NOTBOL) == 0) {
sflags |= SBOS;
/* Step one more for BOS. */
i++;
}
if (c == OUT && (m->eflags & REG_NOTEOL) == 0) {
sflags |= SEOS;
/* Step one more for EOS. */
i++;
}
if (i != 0) { if (i != 0) {
for (; i > 0; i--) for (; i > 0; i--)
st = step(m->g, startst, stopst, st, flagch, st); st = step(m->g, startst, stopst, st, flagch, st,
sflags);
SP("sboleol", st, c); SP("sboleol", st, c);
} }
@ -882,7 +913,7 @@ walk(struct match *m, const char *start, const char *stop, sopno startst,
flagch = EOW; flagch = EOW;
} }
if (flagch == BOW || flagch == EOW) { if (flagch == BOW || flagch == EOW) {
st = step(m->g, startst, stopst, st, flagch, st); st = step(m->g, startst, stopst, st, flagch, st, sflags);
SP("sboweow", st, c); SP("sboweow", st, c);
} }
@ -903,9 +934,10 @@ walk(struct match *m, const char *start, const char *stop, sopno startst,
else else
ASSIGN(st, empty); ASSIGN(st, empty);
assert(c != OUT); assert(c != OUT);
st = step(m->g, startst, stopst, tmp, c, st); st = step(m->g, startst, stopst, tmp, c, st, sflags);
SP("saft", st, c); SP("saft", st, c);
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags),
st));
p += clen; p += clen;
} }
@ -939,7 +971,8 @@ step(struct re_guts *g,
sopno stop, /* state after stop state within strip */ sopno stop, /* state after stop state within strip */
states bef, /* states reachable before */ states bef, /* states reachable before */
wint_t ch, /* character or NONCHAR code */ wint_t ch, /* character or NONCHAR code */
states aft) /* states already known reachable after */ states aft, /* states already known reachable after */
int sflags) /* state flags */
{ {
cset *cs; cset *cs;
sop s; sop s;
@ -960,6 +993,14 @@ step(struct re_guts *g,
if (ch == OPND(s)) if (ch == OPND(s))
FWD(aft, bef, 1); FWD(aft, bef, 1);
break; break;
case OBOS:
if ((ch == BOL || ch == BOLEOL) && (sflags & SBOS) != 0)
FWD(aft, bef, 1);
break;
case OEOS:
if ((ch == EOL || ch == BOLEOL) && (sflags & SEOS) != 0)
FWD(aft, bef, 1);
break;
case OBOL: case OBOL:
if (ch == BOL || ch == BOLEOL) if (ch == BOL || ch == BOLEOL)
FWD(aft, bef, 1); FWD(aft, bef, 1);

View File

@ -480,6 +480,12 @@ p_ere_exp(struct parse *p, struct branchc *bc)
if (p->gnuext) { if (p->gnuext) {
handled = 1; handled = 1;
switch (wc) { switch (wc) {
case '`':
EMIT(OBOS, 0);
break;
case '\'':
EMIT(OEOS, 0);
break;
case 'W': case 'W':
case 'w': case 'w':
case 'S': case 'S':
@ -833,6 +839,12 @@ p_simp_re(struct parse *p, struct branchc *bc)
if (p->gnuext) { if (p->gnuext) {
handled = true; handled = true;
switch (c) { switch (c) {
case BACKSL|'`':
EMIT(OBOS, 0);
break;
case BACKSL|'\'':
EMIT(OEOS, 0);
break;
case BACKSL|'W': case BACKSL|'W':
case BACKSL|'w': case BACKSL|'w':
case BACKSL|'S': case BACKSL|'S':
@ -1878,6 +1890,8 @@ findmust(struct parse *p, struct re_guts *g)
case OEOW: case OEOW:
case OBOL: case OBOL:
case OEOL: case OEOL:
case OBOS:
case OEOS:
case O_QUEST: case O_QUEST:
case O_CH: case O_CH:
case OEND: case OEND:

View File

@ -104,6 +104,8 @@ typedef unsigned long sopno;
#define O_CH (18L<<OPSHIFT) /* end choice back to OOR1 */ #define O_CH (18L<<OPSHIFT) /* end choice back to OOR1 */
#define OBOW (19L<<OPSHIFT) /* begin word - */ #define OBOW (19L<<OPSHIFT) /* begin word - */
#define OEOW (20L<<OPSHIFT) /* end word - */ #define OEOW (20L<<OPSHIFT) /* end word - */
#define OBOS (21L<<OPSHIFT) /* begin subj. - */
#define OEOS (22L<<OPSHIFT) /* end subj. - */
/* /*
* Structures for [] character-set representation. * Structures for [] character-set representation.

View File

@ -25,8 +25,12 @@ a\|b\|c b abc a
#\B[abc]\B & <abc> b #\B[abc]\B & <abc> b
#\B[abc]+ - <abc> bc #\B[abc]+ - <abc> bc
#\B[abc]\+ b <abc> bc #\B[abc]\+ b <abc> bc
#\`abc\' & abc abc \`abc & abc abc
#\`.+\' - abNc abNc abc\' & abc abc
#\`.\+\' b abNc abNc \`abc\' & abc abc
#(\`a) - Na \`.+\' - abNc abNc
#(a\') - aN \`.\+\' b abNc abNc
(\`a) - Na
(a\`) - aN
(a\') - aN
(\'a) - Na