regex(3): Interpret many escaped ordinary characters as EESCAPE
In IEEE 1003.1-2008 [1] and earlier revisions, BRE/ERE grammar allows for any character to be escaped, but "ORD_CHAR preceded by an unescaped <backslash> character [gives undefined results]". Historically, we've interpreted an escaped ordinary character as the ordinary character itself. This becomes problematic when some extensions give special meanings to an otherwise ordinary character (e.g. GNU's \b, \s, \w), meaning we may have two different valid interpretations of the same sequence. To make this easier to deal with and given that the standard calls this undefined, we should throw an error (EESCAPE) if we run into this scenario to ease transition into a state where some escaped ordinaries are blessed with a special meaning -- it will either error out or have extended behavior, rather than have two entirely different versions of undefined behavior that leave the consumer of regex(3) guessing as to what behavior will be used or leaving them with false impressions. This change bumps the symbol version of regcomp to FBSD_1.6 and provides the old escape semantics for legacy applications, just in case one has an older application that would immediately turn into a pumpkin because of an extraneous escape that's embedded or otherwise critical to its operation. This is the final piece needed before enhancing libregex with GNU extensions and flipping the switch on bsdgrep. [1] http://pubs.opengroup.org/onlinepubs/9699919799.2016edition/ PR: 229925 (exp-run, courtesy of antoine) Differential Revision: https://reviews.freebsd.org/D10510
This commit is contained in:
parent
ea83d07e82
commit
adeebf4cd4
@ -4,7 +4,9 @@ a[bc]d & abd abd
|
|||||||
a\*c & a*c a*c
|
a\*c & a*c a*c
|
||||||
a\\b & a\b a\b
|
a\\b & a\b a\b
|
||||||
a\\\*b & a\*b a\*b
|
a\\\*b & a\*b a\*b
|
||||||
a\bc & abc abc
|
# Begin FreeBSD
|
||||||
|
a\bc &C EESCAPE
|
||||||
|
# End FreeBSD
|
||||||
a\ &C EESCAPE
|
a\ &C EESCAPE
|
||||||
a\\bc & a\bc a\bc
|
a\\bc & a\bc a\bc
|
||||||
\{ bC BADRPT
|
\{ bC BADRPT
|
||||||
|
@ -12,7 +12,7 @@ a(b+)c - abbbc abbbc bbb
|
|||||||
a(b*)c - ac ac @c
|
a(b*)c - ac ac @c
|
||||||
(a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de
|
(a|ab)(bc([de]+)f|cde) - abcdef abcdef a,bcdef,de
|
||||||
# Begin FreeBSD
|
# Begin FreeBSD
|
||||||
a\(b\|c\)d b ab|cd ab|cd b|c
|
a\(b|c\)d b ab|cd ab|cd b|c
|
||||||
# End FreeBSD
|
# End FreeBSD
|
||||||
# the regression tester only asks for 9 subexpressions
|
# the regression tester only asks for 9 subexpressions
|
||||||
a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j
|
a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j
|
||||||
|
@ -3,8 +3,11 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
FBSD_1.0 {
|
FBSD_1.0 {
|
||||||
regcomp;
|
|
||||||
regerror;
|
regerror;
|
||||||
regexec;
|
regexec;
|
||||||
regfree;
|
regfree;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
FBSD_1.6 {
|
||||||
|
regcomp;
|
||||||
|
};
|
||||||
|
@ -102,11 +102,14 @@ struct parse {
|
|||||||
sopno pend[NPAREN]; /* -> ) ([0] unused) */
|
sopno pend[NPAREN]; /* -> ) ([0] unused) */
|
||||||
bool allowbranch; /* can this expression branch? */
|
bool allowbranch; /* can this expression branch? */
|
||||||
bool bre; /* convenience; is this a BRE? */
|
bool bre; /* convenience; is this a BRE? */
|
||||||
|
int pflags; /* other parsing flags -- legacy escapes? */
|
||||||
bool (*parse_expr)(struct parse *, struct branchc *);
|
bool (*parse_expr)(struct parse *, struct branchc *);
|
||||||
void (*pre_parse)(struct parse *, struct branchc *);
|
void (*pre_parse)(struct parse *, struct branchc *);
|
||||||
void (*post_parse)(struct parse *, struct branchc *);
|
void (*post_parse)(struct parse *, struct branchc *);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/* parse.pflags bit: when set, may_escape() accepts every escaped character (legacy escape semantics). */
#define PFLAG_LEGACY_ESC 0x00000001
|
||||||
|
|
||||||
/* ========= begin header generated by ./mkh ========= */
|
/* ========= begin header generated by ./mkh ========= */
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
@ -132,6 +135,7 @@ static void p_b_cclass(struct parse *p, cset *cs);
|
|||||||
static void p_b_eclass(struct parse *p, cset *cs);
|
static void p_b_eclass(struct parse *p, cset *cs);
|
||||||
static wint_t p_b_symbol(struct parse *p);
|
static wint_t p_b_symbol(struct parse *p);
|
||||||
static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
|
static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
|
||||||
|
static bool may_escape(struct parse *p, const wint_t ch);
|
||||||
static wint_t othercase(wint_t ch);
|
static wint_t othercase(wint_t ch);
|
||||||
static void bothcases(struct parse *p, wint_t ch);
|
static void bothcases(struct parse *p, wint_t ch);
|
||||||
static void ordinary(struct parse *p, wint_t ch);
|
static void ordinary(struct parse *p, wint_t ch);
|
||||||
@ -199,22 +203,10 @@ static char nuls[10]; /* place to point scanner in event of error */
|
|||||||
/* Macro used by computejump()/computematchjump() */
|
/* Macro used by computejump()/computematchjump() */
|
||||||
#define MIN(a,b) ((a)<(b)?(a):(b))
|
#define MIN(a,b) ((a)<(b)?(a):(b))
|
||||||
|
|
||||||
/*
|
static int /* 0 success, otherwise REG_something */
|
||||||
- regcomp - interface for parser and compilation
|
regcomp_internal(regex_t * __restrict preg,
|
||||||
= extern int regcomp(regex_t *, const char *, int);
|
|
||||||
= #define REG_BASIC 0000
|
|
||||||
= #define REG_EXTENDED 0001
|
|
||||||
= #define REG_ICASE 0002
|
|
||||||
= #define REG_NOSUB 0004
|
|
||||||
= #define REG_NEWLINE 0010
|
|
||||||
= #define REG_NOSPEC 0020
|
|
||||||
= #define REG_PEND 0040
|
|
||||||
= #define REG_DUMP 0200
|
|
||||||
*/
|
|
||||||
int /* 0 success, otherwise REG_something */
|
|
||||||
regcomp(regex_t * __restrict preg,
|
|
||||||
const char * __restrict pattern,
|
const char * __restrict pattern,
|
||||||
int cflags)
|
int cflags, int pflags)
|
||||||
{
|
{
|
||||||
struct parse pa;
|
struct parse pa;
|
||||||
struct re_guts *g;
|
struct re_guts *g;
|
||||||
@ -273,6 +265,7 @@ regcomp(regex_t * __restrict preg,
|
|||||||
p->end = p->next + len;
|
p->end = p->next + len;
|
||||||
p->error = 0;
|
p->error = 0;
|
||||||
p->ncsalloc = 0;
|
p->ncsalloc = 0;
|
||||||
|
p->pflags = pflags;
|
||||||
for (i = 0; i < NPAREN; i++) {
|
for (i = 0; i < NPAREN; i++) {
|
||||||
p->pbegin[i] = 0;
|
p->pbegin[i] = 0;
|
||||||
p->pend[i] = 0;
|
p->pend[i] = 0;
|
||||||
@ -345,6 +338,43 @@ regcomp(regex_t * __restrict preg,
|
|||||||
return(p->error);
|
return(p->error);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 - regcomp - interface for parser and compilation
 = extern int regcomp(regex_t *, const char *, int);
 = #define REG_BASIC 0000
 = #define REG_EXTENDED 0001
 = #define REG_ICASE 0002
 = #define REG_NOSUB 0004
 = #define REG_NEWLINE 0010
 = #define REG_NOSPEC 0020
 = #define REG_PEND 0040
 = #define REG_DUMP 0200
 */
int				/* 0 success, otherwise REG_something */
regcomp(regex_t * __restrict preg,
	const char * __restrict pattern,
	int cflags)
{
	/*
	 * No parser flags are requested here, so PFLAG_LEGACY_ESC stays
	 * clear and may_escape() applies its escape restrictions.
	 */
	const int parser_flags = 0;

	return (regcomp_internal(preg, pattern, cflags, parser_flags));
}
|
||||||
|
|
||||||
|
#ifndef LIBREGEX
|
||||||
|
/*
|
||||||
|
* Legacy interface that requires more lax escaping behavior.
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
freebsd12_regcomp(regex_t * __restrict preg,
|
||||||
|
const char * __restrict pattern,
|
||||||
|
int cflags, int pflags)
|
||||||
|
{
|
||||||
|
|
||||||
|
return (regcomp_internal(preg, pattern, cflags, PFLAG_LEGACY_ESC));
|
||||||
|
}
|
||||||
|
|
||||||
|
__sym_compat(regcomp, freebsd12_regcomp, FBSD_1.0);
|
||||||
|
#endif /* !LIBREGEX */
|
||||||
|
|
||||||
/*
|
/*
|
||||||
- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
|
- p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
|
||||||
- return whether we should terminate or not
|
- return whether we should terminate or not
|
||||||
@ -435,7 +465,10 @@ p_ere_exp(struct parse *p, struct branchc *bc)
|
|||||||
EMIT(OEOW, 0);
|
EMIT(OEOW, 0);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
if (may_escape(p, wc))
|
||||||
ordinary(p, wc);
|
ordinary(p, wc);
|
||||||
|
else
|
||||||
|
SETERROR(REG_EESCAPE);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
@ -797,7 +830,10 @@ p_simp_re(struct parse *p, struct branchc *bc)
|
|||||||
return (false); /* Definitely not $... */
|
return (false); /* Definitely not $... */
|
||||||
p->next--;
|
p->next--;
|
||||||
wc = WGETNEXT();
|
wc = WGETNEXT();
|
||||||
|
if ((c & BACKSL) == 0 || may_escape(p, wc))
|
||||||
ordinary(p, wc);
|
ordinary(p, wc);
|
||||||
|
else
|
||||||
|
SETERROR(REG_EESCAPE);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1094,6 +1130,55 @@ p_b_coll_elem(struct parse *p,
|
|||||||
return(0);
|
return(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
- may_escape - determine whether 'ch' is escape-able in the current context
|
||||||
|
== static int may_escape(struct parse *p, const wint_t ch)
|
||||||
|
*/
|
||||||
|
static bool
|
||||||
|
may_escape(struct parse *p, const wint_t ch)
|
||||||
|
{
|
||||||
|
|
||||||
|
if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
|
||||||
|
return (true);
|
||||||
|
if (isalpha(ch) || ch == '\'' || ch == '`')
|
||||||
|
return (false);
|
||||||
|
return (true);
|
||||||
|
#ifdef NOTYET
|
||||||
|
/*
|
||||||
|
* Build a whitelist of characters that may be escaped to produce an
|
||||||
|
* ordinary in the current context. This assumes that these have not
|
||||||
|
* been otherwise interpreted as a special character. Escaping an
|
||||||
|
* ordinary character yields undefined results according to
|
||||||
|
* IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
|
||||||
|
* advantage of this and use escaped ordinary characters to provide
|
||||||
|
* special meaning, e.g. \b, \B, \w, \W, \s, \S.
|
||||||
|
*/
|
||||||
|
switch(ch) {
|
||||||
|
case '|':
|
||||||
|
case '+':
|
||||||
|
case '?':
|
||||||
|
/* The above characters may not be escaped in BREs */
|
||||||
|
if (!(p->g->cflags®_EXTENDED))
|
||||||
|
return (false);
|
||||||
|
/* Fallthrough */
|
||||||
|
case '(':
|
||||||
|
case ')':
|
||||||
|
case '{':
|
||||||
|
case '}':
|
||||||
|
case '.':
|
||||||
|
case '[':
|
||||||
|
case ']':
|
||||||
|
case '\\':
|
||||||
|
case '*':
|
||||||
|
case '^':
|
||||||
|
case '$':
|
||||||
|
return (true);
|
||||||
|
default:
|
||||||
|
return (false);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
- othercase - return the case counterpart of an alphabetic
|
- othercase - return the case counterpart of an alphabetic
|
||||||
== static wint_t othercase(wint_t ch);
|
== static wint_t othercase(wint_t ch);
|
||||||
|
Loading…
Reference in New Issue
Block a user