sh: Add \u/\U support (in $'...') for UTF-8.
Because we have no iconv in base, support for other charsets is not possible. Note that \u/\U are processed using the locale that was active when the shell started, not the locale in effect when the escape is parsed. This is necessary to avoid behaviour that depends on the parse/execute split (for example when placing braces around an entire script). Because wctomb() follows the current locale, which may have changed since startup, it cannot be used here; therefore, UTF-8 encoding is implemented manually.
This commit is contained in:
parent
3a99ed469a
commit
07eb7033a6
@ -76,7 +76,7 @@ __FBSDID("$FreeBSD$");
|
||||
int rootpid;
|
||||
int rootshell;
|
||||
struct jmploc main_handler;
|
||||
int localeisutf8;
|
||||
int localeisutf8, initial_localeisutf8;
|
||||
|
||||
static void read_profile(const char *);
|
||||
static char *find_dot_file(char *);
|
||||
@ -97,7 +97,7 @@ main(int argc, char *argv[])
|
||||
char *shinit;
|
||||
|
||||
(void) setlocale(LC_ALL, "");
|
||||
updatecharset();
|
||||
initcharset();
|
||||
state = 0;
|
||||
if (setjmp(main_handler.loc)) {
|
||||
switch (exception) {
|
||||
|
@ -1219,6 +1219,29 @@ readcstyleesc(char *out)
|
||||
if (v == 0 || (v >= 0xd800 && v <= 0xdfff))
|
||||
synerror("Bad escape sequence");
|
||||
/* We really need iconv here. */
|
||||
if (initial_localeisutf8 && v > 127) {
|
||||
CHECKSTRSPACE(4, out);
|
||||
/*
|
||||
* We cannot use wctomb() as the locale may have
|
||||
* changed.
|
||||
*/
|
||||
if (v <= 0x7ff) {
|
||||
USTPUTC(0xc0 | v >> 6, out);
|
||||
USTPUTC(0x80 | (v & 0x3f), out);
|
||||
return out;
|
||||
} else if (v <= 0xffff) {
|
||||
USTPUTC(0xe0 | v >> 12, out);
|
||||
USTPUTC(0x80 | ((v >> 6) & 0x3f), out);
|
||||
USTPUTC(0x80 | (v & 0x3f), out);
|
||||
return out;
|
||||
} else if (v <= 0x10ffff) {
|
||||
USTPUTC(0xf0 | v >> 18, out);
|
||||
USTPUTC(0x80 | ((v >> 12) & 0x3f), out);
|
||||
USTPUTC(0x80 | ((v >> 6) & 0x3f), out);
|
||||
USTPUTC(0x80 | (v & 0x3f), out);
|
||||
return out;
|
||||
}
|
||||
}
|
||||
if (v > 127)
|
||||
v = '?';
|
||||
break;
|
||||
|
@ -463,8 +463,8 @@ The Unicode code point
|
||||
(eight hexadecimal digits)
|
||||
.El
|
||||
.Pp
|
||||
The sequences for Unicode code points currently only provide useful results
|
||||
for values below 128.
|
||||
The sequences for Unicode code points are currently only useful with
|
||||
UTF-8 locales.
|
||||
They reject code point 0 and UTF-16 surrogates.
|
||||
.Pp
|
||||
If an escape sequence would produce a byte with value 0,
|
||||
|
@ -517,6 +517,13 @@ updatecharset(void)
|
||||
localeisutf8 = !strcmp(charset, "UTF-8");
|
||||
}
|
||||
|
||||
/*
 * Record the startup charset state: refresh localeisutf8 through
 * updatecharset(), then save that value in initial_localeisutf8.
 * main() calls this after setlocale(LC_ALL, ""); the parser consults
 * initial_localeisutf8 so that \u/\U escapes keep using the locale that
 * was in effect at startup.
 */
void
|
||||
initcharset(void)
|
||||
{
|
||||
updatecharset();
|
||||
/* Snapshot the flag that updatecharset() just computed. */
initial_localeisutf8 = localeisutf8;
|
||||
}
|
||||
|
||||
/*
|
||||
* Generate a list of exported variables. This routine is used to construct
|
||||
* the third argument to execve when executing a program.
|
||||
|
@ -83,6 +83,8 @@ extern struct var vterm;
|
||||
#endif
|
||||
|
||||
extern int localeisutf8;
|
||||
/* The parser uses the locale that was in effect at startup. */
|
||||
extern int initial_localeisutf8;
|
||||
|
||||
/*
|
||||
* The following macros access the values of the above variables.
|
||||
@ -116,6 +118,7 @@ char *bltinlookup(const char *, int);
|
||||
void bltinsetlocale(void);
|
||||
void bltinunsetlocale(void);
|
||||
void updatecharset(void);
|
||||
void initcharset(void);
|
||||
char **environment(void);
|
||||
int showvarscmd(int, char **);
|
||||
int exportcmd(int, char **);
|
||||
|
10
tools/regression/bin/sh/parser/dollar-quote10.0
Normal file
10
tools/regression/bin/sh/parser/dollar-quote10.0
Normal file
@ -0,0 +1,10 @@
|
||||
# $FreeBSD$
|
||||
|
||||
# Check that \u escapes in $'...' yield UTF-8 when the shell is started
# in a UTF-8 locale.  The expected bytes are built with printf octal
# escapes so that they do not depend on the feature under test.
# a umlaut (U+00E4; UTF-8 bytes C3 A4)
|
||||
s=$(printf '\303\244')
|
||||
# euro sign (U+20AC; UTF-8 bytes E2 82 AC)
|
||||
s=$s$(printf '\342\202\254')
|
||||
|
||||
# Start a new shell so the locale change is picked up.
|
||||
ss="$(LC_ALL=en_US.UTF-8 ${SH} -c "printf %s \$'\u00e4\u20ac'")"
|
||||
# The test passes only if the escaped string matches the expected bytes.
[ "$s" = "$ss" ]
|
8
tools/regression/bin/sh/parser/dollar-quote11.0
Normal file
8
tools/regression/bin/sh/parser/dollar-quote11.0
Normal file
@ -0,0 +1,8 @@
|
||||
# $FreeBSD$

# Check that a \U escape in $'...' for a code point outside the BMP yields
# the four-byte UTF-8 encoding when the shell is started in a UTF-8 locale.

# some sort of 't' outside BMP (U+1D565; UTF-8 bytes F0 9D 95 A5)
# Assign rather than append: "s" is not set earlier in this script, so
# "$s" could only pick up a value inherited from the environment and make
# the comparison below fail spuriously.
s=$(printf '\360\235\225\245')

# Start a new shell so the locale change is picked up.
ss="$(LC_ALL=en_US.UTF-8 ${SH} -c "printf %s \$'\U0001d565'")"
[ "$s" = "$ss" ]
|
Loading…
x
Reference in New Issue
Block a user