sh: Add \u/\U support (in $'...') for UTF-8.

Because we have no iconv in base, support for other charsets is not
possible.

Note that \u/\U are processed using the locale that was active when the
shell started. This is necessary to avoid behaviour that depends on the
parse/execute split (for example when placing braces around an entire
script). Therefore, UTF-8 encoding is implemented manually rather than
with wctomb(), which would use the current locale and that locale may
have changed since startup.
This commit is contained in:
Jilles Tjoelker 2011-05-08 17:40:10 +00:00
parent 3a99ed469a
commit 07eb7033a6
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=221669
7 changed files with 55 additions and 4 deletions

View File

@ -76,7 +76,7 @@ __FBSDID("$FreeBSD$");
int rootpid; int rootpid;
int rootshell; int rootshell;
struct jmploc main_handler; struct jmploc main_handler;
int localeisutf8; int localeisutf8, initial_localeisutf8;
static void read_profile(const char *); static void read_profile(const char *);
static char *find_dot_file(char *); static char *find_dot_file(char *);
@ -97,7 +97,7 @@ main(int argc, char *argv[])
char *shinit; char *shinit;
(void) setlocale(LC_ALL, ""); (void) setlocale(LC_ALL, "");
updatecharset(); initcharset();
state = 0; state = 0;
if (setjmp(main_handler.loc)) { if (setjmp(main_handler.loc)) {
switch (exception) { switch (exception) {

View File

@ -1219,6 +1219,29 @@ readcstyleesc(char *out)
if (v == 0 || (v >= 0xd800 && v <= 0xdfff)) if (v == 0 || (v >= 0xd800 && v <= 0xdfff))
synerror("Bad escape sequence"); synerror("Bad escape sequence");
/* We really need iconv here. */ /* We really need iconv here. */
if (initial_localeisutf8 && v > 127) {
CHECKSTRSPACE(4, out);
/*
* We cannot use wctomb() as the locale may have
* changed.
*/
if (v <= 0x7ff) {
USTPUTC(0xc0 | v >> 6, out);
USTPUTC(0x80 | (v & 0x3f), out);
return out;
} else if (v <= 0xffff) {
USTPUTC(0xe0 | v >> 12, out);
USTPUTC(0x80 | ((v >> 6) & 0x3f), out);
USTPUTC(0x80 | (v & 0x3f), out);
return out;
} else if (v <= 0x10ffff) {
USTPUTC(0xf0 | v >> 18, out);
USTPUTC(0x80 | ((v >> 12) & 0x3f), out);
USTPUTC(0x80 | ((v >> 6) & 0x3f), out);
USTPUTC(0x80 | (v & 0x3f), out);
return out;
}
}
if (v > 127) if (v > 127)
v = '?'; v = '?';
break; break;

View File

@ -463,8 +463,8 @@ The Unicode code point
(eight hexadecimal digits) (eight hexadecimal digits)
.El .El
.Pp .Pp
The sequences for Unicode code points currently only provide useful results The sequences for Unicode code points are currently only useful with
for values below 128. UTF-8 locales.
They reject code point 0 and UTF-16 surrogates. They reject code point 0 and UTF-16 surrogates.
.Pp .Pp
If an escape sequence would produce a byte with value 0, If an escape sequence would produce a byte with value 0,

View File

@ -517,6 +517,13 @@ updatecharset(void)
localeisutf8 = !strcmp(charset, "UTF-8"); localeisutf8 = !strcmp(charset, "UTF-8");
} }
/*
 * Establish the character set state at shell startup.
 *
 * Refreshes localeisutf8 via updatecharset() and then records that value
 * in initial_localeisutf8, so that a startup-time copy of the flag is
 * kept separately from localeisutf8 itself.
 */
void
initcharset(void)
{
	/* Compute localeisutf8 for the locale currently in effect. */
	updatecharset();

	/* Keep a copy of the value seen at initialisation time. */
	initial_localeisutf8 = localeisutf8;
}
/* /*
* Generate a list of exported variables. This routine is used to construct * Generate a list of exported variables. This routine is used to construct
* the third argument to execve when executing a program. * the third argument to execve when executing a program.

View File

@ -83,6 +83,8 @@ extern struct var vterm;
#endif #endif
extern int localeisutf8; extern int localeisutf8;
/* The parser uses the locale that was in effect at startup. */
extern int initial_localeisutf8;
/* /*
* The following macros access the values of the above variables. * The following macros access the values of the above variables.
@ -116,6 +118,7 @@ char *bltinlookup(const char *, int);
void bltinsetlocale(void); void bltinsetlocale(void);
void bltinunsetlocale(void); void bltinunsetlocale(void);
void updatecharset(void); void updatecharset(void);
void initcharset(void);
char **environment(void); char **environment(void);
int showvarscmd(int, char **); int showvarscmd(int, char **);
int exportcmd(int, char **); int exportcmd(int, char **);

View File

@ -0,0 +1,10 @@
# $FreeBSD$

# Check that \u escapes inside $'...' produce UTF-8 in a UTF-8 locale.
#
# Expected bytes, built from octal escapes:
#   U+00E4 (a umlaut)  -> C3 A4
#   U+20AC (euro sign) -> E2 82 AC
expected="$(printf '\303\244')$(printf '\342\202\254')"

# The escapes are evaluated by a freshly started shell so that it picks up
# the UTF-8 locale at startup.
actual=$(LC_ALL=en_US.UTF-8 ${SH} -c "printf %s \$'\u00e4\u20ac'")

# The exit status of the script is the result of this comparison.
test "$expected" = "$actual"

View File

@ -0,0 +1,8 @@
# $FreeBSD$

# Check that a \U escape inside $'...' produces a four-byte UTF-8 sequence
# in a UTF-8 locale.
#
# some sort of 't' outside BMP: U+1D565 -> F0 9D 95 A5
# Assign directly instead of appending to $s, so that a value of s
# inherited from the environment cannot leak into the expected string.
s=$(printf '\360\235\225\245')
# Start a new shell so the locale change is picked up.
ss="$(LC_ALL=en_US.UTF-8 ${SH} -c "printf %s \$'\U0001d565'")"
# The exit status of the script is the result of this comparison.
[ "$s" = "$ss" ]