libregex: implement \b and \B (word boundary, not word boundary)
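This is the last of the GNU regex extensions needed before we can enable bsdgrep by default. \b is effectively a direction-agnostic equivalent of \< and \>: it matches at either the beginning or the end of a word. \B is its complement: it matches at every position that is not a transition from a non-word character to a word character or from a word character to a non-word character.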
This commit is contained in:
parent
ca53e5aedf
commit
6b986646d4
@ -5,7 +5,7 @@ a\*c & a*c a*c
|
|||||||
a\\b & a\b a\b
|
a\\b & a\b a\b
|
||||||
a\\\*b & a\*b a\*b
|
a\\\*b & a\*b a\*b
|
||||||
# Begin FreeBSD
|
# Begin FreeBSD
|
||||||
a\bc &C EESCAPE
|
a\bc & abc
|
||||||
# End FreeBSD
|
# End FreeBSD
|
||||||
a\ &C EESCAPE
|
a\ &C EESCAPE
|
||||||
a\\bc & a\bc a\bc
|
a\\bc & a\bc a\bc
|
||||||
|
@ -118,6 +118,7 @@ static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_
|
|||||||
#define BOW (BOL-4)
|
#define BOW (BOL-4)
|
||||||
#define EOW (BOL-5)
|
#define EOW (BOL-5)
|
||||||
#define BADCHAR (BOL-6)
|
#define BADCHAR (BOL-6)
|
||||||
|
#define NWBND (BOL-7)
|
||||||
#define NONCHAR(c) ((c) <= OUT)
|
#define NONCHAR(c) ((c) <= OUT)
|
||||||
/* sflags */
|
/* sflags */
|
||||||
#define SBOS 0x0001
|
#define SBOS 0x0001
|
||||||
@ -463,6 +464,8 @@ dissect(struct match *m,
|
|||||||
case OEOW:
|
case OEOW:
|
||||||
case OBOS:
|
case OBOS:
|
||||||
case OEOS:
|
case OEOS:
|
||||||
|
case OWBND:
|
||||||
|
case ONWBND:
|
||||||
break;
|
break;
|
||||||
case OANY:
|
case OANY:
|
||||||
case OANYOF:
|
case OANYOF:
|
||||||
@ -691,6 +694,21 @@ backref(struct match *m,
|
|||||||
else
|
else
|
||||||
return(NULL);
|
return(NULL);
|
||||||
break;
|
break;
|
||||||
|
case OWBND:
|
||||||
|
if (ISBOW(m, sp) || ISEOW(m, sp))
|
||||||
|
{ /* yes */ }
|
||||||
|
else
|
||||||
|
return(NULL);
|
||||||
|
break;
|
||||||
|
case ONWBND:
|
||||||
|
if (((sp == m->beginp) && !ISWORD(*sp)) ||
|
||||||
|
(sp == m->endp && !ISWORD(*(sp - 1))))
|
||||||
|
{ /* yes, beginning/end of subject */ }
|
||||||
|
else if (ISWORD(*(sp - 1)) == ISWORD(*sp))
|
||||||
|
{ /* yes, same word-ness on both sides: not a word boundary */ }
|
||||||
|
else
|
||||||
|
return(NULL);
|
||||||
|
break;
|
||||||
case OBOW:
|
case OBOW:
|
||||||
if (ISBOW(m, sp))
|
if (ISBOW(m, sp))
|
||||||
{ /* yes */ }
|
{ /* yes */ }
|
||||||
@ -916,6 +934,17 @@ walk(struct match *m, const char *start, const char *stop, sopno startst,
|
|||||||
st = step(m->g, startst, stopst, st, flagch, st, sflags);
|
st = step(m->g, startst, stopst, st, flagch, st, sflags);
|
||||||
SP("sboweow", st, c);
|
SP("sboweow", st, c);
|
||||||
}
|
}
|
||||||
|
if (lastc != OUT && c != OUT &&
|
||||||
|
ISWORD(lastc) == ISWORD(c)) {
|
||||||
|
flagch = NWBND;
|
||||||
|
} else if ((lastc == OUT && !ISWORD(c)) ||
|
||||||
|
(c == OUT && !ISWORD(lastc))) {
|
||||||
|
flagch = NWBND;
|
||||||
|
}
|
||||||
|
if (flagch == NWBND) {
|
||||||
|
st = step(m->g, startst, stopst, st, flagch, st, sflags);
|
||||||
|
SP("snwbnd", st, c);
|
||||||
|
}
|
||||||
|
|
||||||
/* are we done? */
|
/* are we done? */
|
||||||
if (ISSET(st, stopst)) {
|
if (ISSET(st, stopst)) {
|
||||||
@ -1017,6 +1046,14 @@ step(struct re_guts *g,
|
|||||||
if (ch == EOW)
|
if (ch == EOW)
|
||||||
FWD(aft, bef, 1);
|
FWD(aft, bef, 1);
|
||||||
break;
|
break;
|
||||||
|
case OWBND:
|
||||||
|
if (ch == BOW || ch == EOW)
|
||||||
|
FWD(aft, bef, 1);
|
||||||
|
break;
|
||||||
|
case ONWBND:
|
||||||
|
if (ch == NWBND)
|
||||||
|
FWD(aft, aft, 1);
|
||||||
|
break;
|
||||||
case OANY:
|
case OANY:
|
||||||
if (!NONCHAR(ch))
|
if (!NONCHAR(ch))
|
||||||
FWD(aft, bef, 1);
|
FWD(aft, bef, 1);
|
||||||
|
@ -486,6 +486,12 @@ p_ere_exp(struct parse *p, struct branchc *bc)
|
|||||||
case '\'':
|
case '\'':
|
||||||
EMIT(OEOS, 0);
|
EMIT(OEOS, 0);
|
||||||
break;
|
break;
|
||||||
|
case 'B':
|
||||||
|
EMIT(ONWBND, 0);
|
||||||
|
break;
|
||||||
|
case 'b':
|
||||||
|
EMIT(OWBND, 0);
|
||||||
|
break;
|
||||||
case 'W':
|
case 'W':
|
||||||
case 'w':
|
case 'w':
|
||||||
case 'S':
|
case 'S':
|
||||||
@ -845,6 +851,12 @@ p_simp_re(struct parse *p, struct branchc *bc)
|
|||||||
case BACKSL|'\'':
|
case BACKSL|'\'':
|
||||||
EMIT(OEOS, 0);
|
EMIT(OEOS, 0);
|
||||||
break;
|
break;
|
||||||
|
case BACKSL|'B':
|
||||||
|
EMIT(ONWBND, 0);
|
||||||
|
break;
|
||||||
|
case BACKSL|'b':
|
||||||
|
EMIT(OWBND, 0);
|
||||||
|
break;
|
||||||
case BACKSL|'W':
|
case BACKSL|'W':
|
||||||
case BACKSL|'w':
|
case BACKSL|'w':
|
||||||
case BACKSL|'S':
|
case BACKSL|'S':
|
||||||
@ -1892,6 +1904,8 @@ findmust(struct parse *p, struct re_guts *g)
|
|||||||
case OEOL:
|
case OEOL:
|
||||||
case OBOS:
|
case OBOS:
|
||||||
case OEOS:
|
case OEOS:
|
||||||
|
case OWBND:
|
||||||
|
case ONWBND:
|
||||||
case O_QUEST:
|
case O_QUEST:
|
||||||
case O_CH:
|
case O_CH:
|
||||||
case OEND:
|
case OEND:
|
||||||
@ -2043,6 +2057,8 @@ altoffset(sop *scan, int offset)
|
|||||||
try++;
|
try++;
|
||||||
case OBOW:
|
case OBOW:
|
||||||
case OEOW:
|
case OEOW:
|
||||||
|
case OWBND:
|
||||||
|
case ONWBND:
|
||||||
case OLPAREN:
|
case OLPAREN:
|
||||||
case ORPAREN:
|
case ORPAREN:
|
||||||
case OOR2:
|
case OOR2:
|
||||||
|
@ -106,6 +106,8 @@ typedef unsigned long sopno;
|
|||||||
#define OEOW (20L<<OPSHIFT) /* end word - */
|
#define OEOW (20L<<OPSHIFT) /* end word - */
|
||||||
#define OBOS (21L<<OPSHIFT) /* begin subj. - */
|
#define OBOS (21L<<OPSHIFT) /* begin subj. - */
|
||||||
#define OEOS (22L<<OPSHIFT) /* end subj. - */
|
#define OEOS (22L<<OPSHIFT) /* end subj. - */
|
||||||
|
#define OWBND (23L<<OPSHIFT) /* word bound - */
|
||||||
|
#define ONWBND (24L<<OPSHIFT) /* not bound - */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Structures for [] character-set representation.
|
* Structures for [] character-set representation.
|
||||||
|
@ -17,14 +17,12 @@ a\|b\|c b abc a
|
|||||||
\s\+ b aSNTb SNT
|
\s\+ b aSNTb SNT
|
||||||
# Word boundaries (\b, \B, \<, \>, \`, \')
|
# Word boundaries (\b, \B, \<, \>, \`, \')
|
||||||
# (is/not boundary, start/end word, start/end subject string)
|
# (is/not boundary, start/end word, start/end subject string)
|
||||||
# Most of these are disabled for the moment, and will be re-enabled as
|
\babc\b & <abc> abc
|
||||||
# we become feature complete.
|
|
||||||
#\babc\b & <abc> abc
|
|
||||||
\<abc\> & <abc> abc
|
\<abc\> & <abc> abc
|
||||||
#\Babc\B & abc
|
\Babc\B & abc
|
||||||
#\B[abc]\B & <abc> b
|
\B[abc]\B & <abc> b
|
||||||
#\B[abc]+ - <abc> bc
|
\B[abc]+ - <abc> bc
|
||||||
#\B[abc]\+ b <abc> bc
|
\B[abc]\+ b <abc> bc
|
||||||
\`abc & abc abc
|
\`abc & abc abc
|
||||||
abc\' & abc abc
|
abc\' & abc abc
|
||||||
\`abc\' & abc abc
|
\`abc\' & abc abc
|
||||||
|
Loading…
x
Reference in New Issue
Block a user