sh: Add $'quoting' (C-style escape sequences).

A string between $' and ' may contain backslash escape sequences similar to
the ones in a C string constant (except that a single-quote must be escaped
and a double-quote need not be). Details are in the sh(1) man page.

This construct is useful to include unprintable characters, tabs and
newlines in strings; while this can be done with a command substitution
containing a printf command, that needs ugly workarounds if the result is to
end with a newline as command substitution removes all trailing newlines.

The construct may also be useful in future to describe unprintable
characters without needing to write those characters themselves in 'set -x',
'export -p' and the like.

The implementation attempts to comply with the proposal for the next issue of
the POSIX specification. Because this construct is not in POSIX.1-2008,
using it in scripts intended to be portable is unwise.

Matching the minimal locale support in the rest of sh, the \u and \U
sequences are currently not useful.

Exp-run done by: pav (with some other sh(1) changes)
This commit is contained in:
jilles 2011-05-05 20:55:55 +00:00
parent 7ec44d66a6
commit 5a49f52603
12 changed files with 312 additions and 8 deletions

View File

@ -64,6 +64,7 @@ struct synclass synclass[] = {
{ "CWORD", "character is nothing special" },
{ "CNL", "newline character" },
{ "CBACK", "a backslash character" },
{ "CSBACK", "a backslash character in single quotes" },
{ "CSQUOTE", "single quote" },
{ "CDQUOTE", "double quote" },
{ "CENDQUOTE", "a terminating quote" },
@ -224,6 +225,7 @@ main(int argc __unused, char **argv __unused)
init();
fputs("\n/* syntax table used when in single quotes */\n", cfile);
add("\n", "CNL");
add("\\", "CSBACK");
add("'", "CENDQUOTE");
/* ':/' for tilde expansion, '-' for [a\-x] pattern ranges */
add("!*?[=~:/-", "CCTL");

View File

@ -1126,6 +1126,127 @@ parsebackq(char *out, struct nodelist **pbqlist,
}
/*
 * Return the numeric value of the hexadecimal digit c, or -1 if c is not
 * a hexadecimal digit.
 */
static int
cstyle_hexval(int c)
{
	if (c >= '0' && c <= '9')
		return (c - '0');
	if (c >= 'A' && c <= 'F')
		return (c - 'A' + 10);
	if (c >= 'a' && c <= 'f')
		return (c - 'a' + 10);
	return (-1);
}

/*
 * Called to parse a backslash escape sequence inside $'...'.
 * The backslash has already been read.
 *
 * The byte denoted by the escape is appended to the output at 'out'
 * (preceded by CTLESC if the single-quote syntax table classifies it as
 * CCTL) and the advanced output pointer is returned.  A backslash-newline
 * pair produces no output.  An escape that yields a NUL byte produces no
 * output either and causes the rest of the string, up to but not including
 * the closing single-quote, to be discarded.
 *
 * Malformed sequences are reported through synerror(), which this code
 * relies on not to return.
 */
static char *
readcstyleesc(char *out)
{
	unsigned long u;
	int c, v, d, i, n;

	c = pgetc();
	switch (c) {
	case '\0':
		synerror("Unterminated quoted string");
		/* synerror() is expected not to return. */
	case '\n':
		/* Line continuation: count the line and prompt for more. */
		plinno++;
		if (doprompt)
			setprompt(2);
		else
			setprompt(0);
		return out;
	case '\\':
	case '\'':
	case '"':
		v = c;
		break;
	case 'a': v = '\a'; break;
	case 'b': v = '\b'; break;
	case 'e': v = '\033'; break;
	case 'f': v = '\f'; break;
	case 'n': v = '\n'; break;
	case 'r': v = '\r'; break;
	case 't': v = '\t'; break;
	case 'v': v = '\v'; break;
	case 'x':
		/* Any number of hex digits; only the last two matter. */
		v = 0;
		for (;;) {
			d = cstyle_hexval(pgetc());
			if (d < 0)
				break;
			/*
			 * Keep only the low byte while accumulating so an
			 * arbitrarily long digit run cannot overflow the
			 * signed int.
			 */
			v = ((v << 4) | d) & 0xff;
		}
		/* Give back the character that ended the digit run. */
		pungetc();
		break;
	case '0': case '1': case '2': case '3':
	case '4': case '5': case '6': case '7':
		/* One to three octal digits. */
		v = c - '0';
		c = pgetc();
		if (c >= '0' && c <= '7') {
			v <<= 3;
			v += c - '0';
			c = pgetc();
			if (c >= '0' && c <= '7') {
				v <<= 3;
				v += c - '0';
			} else
				pungetc();
		} else
			pungetc();
		break;
	case 'c':
		/* \cX: accept '?' through 'z' except '`'. */
		c = pgetc();
		if (c < 0x3f || c > 0x7a || c == 0x60)
			synerror("Bad escape sequence");
		/* A backslash as the control letter must be doubled. */
		if (c == '\\' && pgetc() != '\\')
			synerror("Bad escape sequence");
		if (c == '?')
			v = 127;
		else
			v = c & 0x1f;
		break;
	case 'u':
	case 'U':
		/* Exactly 4 (\u) or 8 (\U) hex digits. */
		n = c == 'U' ? 8 : 4;
		/*
		 * Accumulate in an unsigned type: eight hex digits can set
		 * bit 31, which would overflow a signed int.
		 */
		u = 0;
		for (i = 0; i < n; i++) {
			d = cstyle_hexval(pgetc());
			if (d < 0)
				synerror("Bad escape sequence");
			u = (u << 4) | (unsigned long)d;
		}
		/* Code point 0 and UTF-16 surrogates are rejected. */
		if (u == 0 || (u >= 0xd800 && u <= 0xdfff))
			synerror("Bad escape sequence");
		/* We really need iconv here. */
		if (u > 127)
			v = '?';
		else
			v = (int)u;
		break;
	default:
		synerror("Bad escape sequence");
	}
	/*
	 * Reduce to a char value.
	 * NOTE(review): this may be negative where char is signed; the
	 * SQSYNTAX lookup below presumably tolerates that -- confirm.
	 */
	v = (char)v;
	/*
	 * We can't handle NUL bytes.
	 * POSIX says we should skip till the closing quote.
	 */
	if (v == '\0') {
		while ((c = pgetc()) != '\'') {
			if (c == '\\')
				c = pgetc();
			if (c == PEOF)
				synerror("Unterminated quoted string");
			/*
			 * Skipped newlines still count as input lines and
			 * still need a continuation prompt.
			 */
			if (c == '\n') {
				plinno++;
				if (doprompt)
					setprompt(2);
				else
					setprompt(0);
			}
		}
		/* Leave the closing quote for the caller. */
		pungetc();
		return out;
	}
	if (SQSYNTAX[v] == CCTL)
		USTPUTC(CTLESC, out);
	USTPUTC(v, out);
	return out;
}
/*
* If eofmark is NULL, read a word or a redirection symbol. If eofmark
* is not NULL, read a here document. In the latter case, eofmark is the
@ -1158,6 +1279,7 @@ readtoken1(int firstc, char const *initialsyntax, char *eofmark, int striptabs)
struct tokenstate state_static[MAXNEST_static];
int maxnest = MAXNEST_static;
struct tokenstate *state = state_static;
int sqiscstyle = 0;
startlinno = plinno;
quotef = 0;
@ -1188,6 +1310,12 @@ readtoken1(int firstc, char const *initialsyntax, char *eofmark, int striptabs)
setprompt(0);
c = pgetc();
goto loop; /* continue outer loop */
case CSBACK:
if (sqiscstyle) {
out = readcstyleesc(out);
break;
}
/* FALLTHROUGH */
case CWORD:
USTPUTC(c, out);
break;
@ -1232,6 +1360,7 @@ readtoken1(int firstc, char const *initialsyntax, char *eofmark, int striptabs)
case CSQUOTE:
USTPUTC(CTLQUOTEMARK, out);
state[level].syntax = SQSYNTAX;
sqiscstyle = 0;
break;
case CDQUOTE:
USTPUTC(CTLQUOTEMARK, out);
@ -1450,11 +1579,7 @@ parsesub: {
int c1;
c = pgetc();
if (c != '(' && c != '{' && (is_eof(c) || !is_name(c)) &&
!is_special(c)) {
USTPUTC('$', out);
pungetc();
} else if (c == '(') { /* $(command) or $((arith)) */
if (c == '(') { /* $(command) or $((arith)) */
if (pgetc() == '(') {
PARSEARITH();
} else {
@ -1465,7 +1590,7 @@ parsesub: {
state[level].syntax == DQSYNTAX ||
state[level].syntax == ARISYNTAX);
}
} else {
} else if (c == '{' || is_name(c) || is_special(c)) {
USTPUTC(CTLVAR, out);
typeloc = out - stackblock();
USTPUTC(VSNORMAL, out);
@ -1612,6 +1737,14 @@ parsesub: {
newvarnest++;
}
}
} else if (c == '\'' && state[level].syntax == BASESYNTAX) {
/* $'cstylequotes' */
USTPUTC(CTLQUOTEMARK, out);
state[level].syntax = SQSYNTAX;
sqiscstyle = 1;
} else {
USTPUTC('$', out);
pungetc();
}
goto parsesub_return;
}

View File

@ -32,7 +32,7 @@
.\" from: @(#)sh.1 8.6 (Berkeley) 5/4/95
.\" $FreeBSD$
.\"
.Dd March 20, 2011
.Dd May 5, 2011
.Dt SH 1
.Os
.Sh NAME
@ -396,13 +396,82 @@ Quoting is used to remove the special meaning of certain characters
or words to the shell, such as operators, whitespace, keywords,
or alias names.
.Pp
There are three types of quoting: matched single quotes,
There are four types of quoting: matched single quotes,
dollar-single quotes,
matched double quotes, and backslash.
.Bl -tag -width indent
.It Single Quotes
Enclosing characters in single quotes preserves the literal
meaning of all the characters (except single quotes, making
it impossible to put single-quotes in a single-quoted string).
.It Dollar-Single Quotes
Enclosing characters between
.Li $'
and
.Li '
preserves the literal meaning of all characters
except backslashes and single quotes.
A backslash introduces a C-style escape sequence:
.Bl -tag -width xUnnnnnnnn
.It \ea
Alert (ring the terminal bell)
.It \eb
Backspace
.It \ec Ns Ar c
The control character denoted by
.Li ^ Ns Ar c
in
.Xr stty 1 .
If
.Ar c
is a backslash, it must be doubled.
.It \ee
The ESC character
.Tn ( ASCII
0x1b)
.It \ef
Formfeed
.It \en
Newline
.It \er
Carriage return
.It \et
Horizontal tab
.It \ev
Vertical tab
.It \e\e
Literal backslash
.It \e\&'
Literal single-quote
.It \e\&"
Literal double-quote
.It \e Ns Ar nnn
The byte whose octal value is
.Ar nnn
(one to three digits)
.It \ex Ns Ar nn
The byte whose hexadecimal value is
.Ar nn
(one or more digits, only the last two of which are used)
.It \eu Ns Ar nnnn
The Unicode code point
.Ar nnnn
(four hexadecimal digits)
.It \eU Ns Ar nnnnnnnn
The Unicode code point
.Ar nnnnnnnn
(eight hexadecimal digits)
.El
.Pp
The sequences for Unicode code points currently only provide useful results
for values below 128.
They reject code point 0 and UTF-16 surrogates.
.Pp
If an escape sequence would produce a byte with value 0,
that byte and the rest of the string until the matching single-quote
are ignored.
.Pp
Any other string starting with a backslash is an error.
.It Double Quotes
Enclosing characters within double quotes preserves the literal
meaning of all characters except dollar sign

View File

@ -0,0 +1,12 @@
# $FreeBSD$
# Basic $'...' checks.  With set -e, the first failing test aborts the
# script with a non-zero exit status.
set -e
# Without escapes, $'...' yields the enclosed text unchanged.
[ $'hi' = hi ]
# A literal newline inside $'...' is preserved.
[ $'hi
there' = 'hi
there' ]
# Escaped quotes and backslash, plus \a \b \f \t \v, checked against printf.
[ $'\"\'\\\a\b\f\t\v' = "\"'\\$(printf "\a\b\f\t\v")" ]
# \n expands to a newline.
[ $'hi\nthere' = 'hi
there' ]
# \r expands to a carriage return.
[ $'a\rb' = "$(printf "a\rb")" ]

View File

@ -0,0 +1,5 @@
# $FreeBSD$
# This depends on the ASCII character set.
# \e must expand to the ESC byte (octal 033).
[ $'\e' = "$(printf "\033")" ]

View File

@ -0,0 +1,22 @@
# $FreeBSD$
# Check every octal escape \001 through \377 against printf's expansion.
# NOTE(review): the single-byte locale is presumably chosen so that
# ${#ee} counts one character per byte -- confirm.
unset LC_ALL
LC_CTYPE=en_US.ISO8859-1
export LC_CTYPE
# Build e as the concatenation of all three-digit octal escapes except \000.
e=
for i in 0 1 2 3; do
for j in 0 1 2 3 4 5 6 7; do
for k in 0 1 2 3 4 5 6 7; do
case $i$j$k in
000) continue ;;
esac
e="$e\\$i$j$k"
done
done
done
# Reference value: let printf expand the escapes.
ee=`printf "$e"`
# A wrong length is reported on standard output instead of via exit status.
[ "${#ee}" = 255 ] || echo length bad
# Start a new shell so the locale change is picked up.
# ${SH} is expected to be set by the caller to the shell under test.
[ "$(${SH} -c "printf %s \$'$e'")" = "$ee" ]

View File

@ -0,0 +1,19 @@
# $FreeBSD$
# Expand every hex escape \x01 through \xff inside $'...' and check that
# the result is 255 characters long.  Only the length is checked; the
# bytes are not compared against a reference value.
unset LC_ALL
LC_CTYPE=en_US.ISO8859-1
export LC_CTYPE
# Build e as the concatenation of all two-digit hex escapes except \x00.
e=
for i in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do
for j in 0 1 2 3 4 5 6 7 8 9 a b c d e f; do
case $i$j in
00) continue ;;
esac
e="$e\x$i$j"
done
done
# Start a new shell so the locale change is picked up.
# ${SH} is expected to be set by the caller to the shell under test.
ee="$(${SH} -c "printf %s \$'$e'")"
# A wrong length is reported on standard output instead of via exit status.
[ "${#ee}" = 255 ] || echo length bad

View File

@ -0,0 +1,12 @@
# $FreeBSD$
# This depends on the ASCII character set.
# \cX control-character escapes, compared against the equivalent octal
# escapes.  With set -e, the first failing test aborts the script.
set -e
# Lowercase letters map to control characters 1 through 26.
[ $'\ca\cb\cc\cd\ce\cf\cg\ch\ci\cj\ck\cl\cm\cn\co\cp\cq\cr\cs\ct\cu\cv\cw\cx\cy\cz' = $'\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032' ]
# Uppercase letters map to the same control characters.
[ $'\cA\cB\cC\cD\cE\cF\cG\cH\cI\cJ\cK\cL\cM\cN\cO\cP\cQ\cR\cS\cT\cU\cV\cW\cX\cY\cZ' = $'\001\002\003\004\005\006\007\010\011\012\013\014\015\016\017\020\021\022\023\024\025\026\027\030\031\032' ]
# Punctuation that denotes the remaining control characters.
[ $'\c[' = $'\033' ]
[ $'\c]' = $'\035' ]
[ $'\c^' = $'\036' ]
[ $'\c_' = $'\037' ]

View File

@ -0,0 +1,5 @@
# $FreeBSD$
# This depends on the ASCII character set.
# \c followed by a doubled backslash denotes control character octal 034.
[ $'\c\\' = $'\034' ]

View File

@ -0,0 +1,6 @@
# $FreeBSD$
# \u and \U escapes for code points below 128 must yield the matching
# ASCII characters.  With set -e, the first failing test aborts the script.
set -e
# Four-digit form.
[ $'\u0024\u0040\u0060' = '$@`' ]
# Eight-digit form.
[ $'\U00000024\U00000040\U00000060' = '$@`' ]

View File

@ -0,0 +1,11 @@
# $FreeBSD$
# An escape that produces a NUL byte (\0, \000 or \x00) discards that byte
# and the rest of that $'...' string up to its closing quote; a following
# $'...' string is unaffected.
#
# set -e makes any failing test fail the script; without it only the
# status of the last test would determine the exit status.
set -e
[ $'hello\0' = hello ]
[ $'hello\0world' = hello ]
[ $'hello\0'$'world' = helloworld ]
[ $'hello\000' = hello ]
[ $'hello\000world' = hello ]
[ $'hello\000'$'world' = helloworld ]
[ $'hello\x00' = hello ]
[ $'hello\x00world' = hello ]
[ $'hello\x00'$'world' = helloworld ]

View File

@ -0,0 +1,8 @@
# $FreeBSD$
# POSIX and C99 say D800-DFFF are undefined in a universal character name.
# We reject this but many other shells expand to something that looks like
# CESU-8.
# Parse the construct via eval in a subshell so a syntax error does not
# terminate this script; capture standard error and discard standard output.
v=$( (eval ": \$'\uD800'") 2>&1 >/dev/null)
# The subshell must have failed and must have printed an error message.
[ $? -ne 0 ] && [ -n "$v" ]