sh: Add \u/\U support (in $'...') for UTF-8.

Because we have no iconv in base, support for other charsets is not
possible.

Note that \u/\U are processed using the locale that was active when the
shell started. This is necessary to avoid behaviour that depends on the
parse/execute split (for example when placing braces around an entire
script). Therefore, UTF-8 encoding is implemented manually instead of
calling wctomb(), which would follow the current (possibly changed) locale.
This commit is contained in:
Jilles Tjoelker 2011-05-08 17:40:10 +00:00
parent 3a99ed469a
commit 07eb7033a6
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=221669
7 changed files with 55 additions and 4 deletions

View File

@ -76,7 +76,7 @@ __FBSDID("$FreeBSD$");
int rootpid;
int rootshell;
struct jmploc main_handler;
int localeisutf8;
int localeisutf8, initial_localeisutf8;
static void read_profile(const char *);
static char *find_dot_file(char *);
@ -97,7 +97,7 @@ main(int argc, char *argv[])
char *shinit;
(void) setlocale(LC_ALL, "");
updatecharset();
initcharset();
state = 0;
if (setjmp(main_handler.loc)) {
switch (exception) {

View File

@ -1219,6 +1219,29 @@ readcstyleesc(char *out)
if (v == 0 || (v >= 0xd800 && v <= 0xdfff))
synerror("Bad escape sequence");
/* We really need iconv here. */
if (initial_localeisutf8 && v > 127) {
CHECKSTRSPACE(4, out);
/*
* We cannot use wctomb() as the locale may have
* changed.
*/
if (v <= 0x7ff) {
USTPUTC(0xc0 | v >> 6, out);
USTPUTC(0x80 | (v & 0x3f), out);
return out;
} else if (v <= 0xffff) {
USTPUTC(0xe0 | v >> 12, out);
USTPUTC(0x80 | ((v >> 6) & 0x3f), out);
USTPUTC(0x80 | (v & 0x3f), out);
return out;
} else if (v <= 0x10ffff) {
USTPUTC(0xf0 | v >> 18, out);
USTPUTC(0x80 | ((v >> 12) & 0x3f), out);
USTPUTC(0x80 | ((v >> 6) & 0x3f), out);
USTPUTC(0x80 | (v & 0x3f), out);
return out;
}
}
if (v > 127)
v = '?';
break;

View File

@ -463,8 +463,8 @@ The Unicode code point
(eight hexadecimal digits)
.El
.Pp
The sequences for Unicode code points currently only provide useful results
for values below 128.
The sequences for Unicode code points are currently only useful with
UTF-8 locales.
They reject code point 0 and UTF-16 surrogates.
.Pp
If an escape sequence would produce a byte with value 0,

View File

@ -517,6 +517,13 @@ updatecharset(void)
localeisutf8 = !strcmp(charset, "UTF-8");
}
/*
 * Record the character set state of the locale the shell started with.
 *
 * Calls updatecharset() to refresh localeisutf8, then saves that value in
 * initial_localeisutf8. main() calls this right after setlocale(LC_ALL, ""),
 * and the \u/\U escape handling consults initial_localeisutf8 rather than
 * localeisutf8.
 */
void
initcharset(void)
{
/* Must run first: the assignment below copies the value it computes. */
updatecharset();
initial_localeisutf8 = localeisutf8;
}
/*
* Generate a list of exported variables. This routine is used to construct
* the third argument to execve when executing a program.

View File

@ -83,6 +83,8 @@ extern struct var vterm;
#endif
extern int localeisutf8;
/* The parser uses the locale that was in effect at startup. */
extern int initial_localeisutf8;
/*
* The following macros access the values of the above variables.
@ -116,6 +118,7 @@ char *bltinlookup(const char *, int);
void bltinsetlocale(void);
void bltinunsetlocale(void);
void updatecharset(void);
void initcharset(void);
char **environment(void);
int showvarscmd(int, char **);
int exportcmd(int, char **);

View File

@ -0,0 +1,10 @@
# $FreeBSD$
# Expected output as raw UTF-8 bytes: U+00E4 (a umlaut) followed by
# U+20AC (euro sign).
want=$(printf '\303\244\342\202\254')
# Start a new shell so the locale change is picked up.
got=$(LC_ALL=en_US.UTF-8 ${SH} -c "printf %s \$'\u00e4\u20ac'")
# The script's exit status is the result of this comparison.
[ "$want" = "$got" ]

View File

@ -0,0 +1,8 @@
# $FreeBSD$
# Expected output as raw UTF-8 bytes: U+1D565 (mathematical double-struck
# small t), a code point outside the BMP that needs a 4-byte sequence.
# Assign s outright: this script has no earlier value to append to, and
# expanding an inherited $s would corrupt the expected string.
s=$(printf '\360\235\225\245')
# Start a new shell so the locale change is picked up.
ss="$(LC_ALL=en_US.UTF-8 ${SH} -c "printf %s \$'\U0001d565'")"
# The script's exit status is the result of this comparison.
[ "$s" = "$ss" ]