From 07eb7033a6e87970afa4f02a568fa25fce79bd04 Mon Sep 17 00:00:00 2001
From: Jilles Tjoelker <jilles@FreeBSD.org>
Date: Sun, 8 May 2011 17:40:10 +0000
Subject: [PATCH] sh: Add \u/\U support (in $'...') for UTF-8.

Because we have no iconv in base, support for other charsets is not
possible.

Note that \u/\U are processed using the locale that was active when the
shell started. This is necessary to avoid behaviour that depends on the
parse/execute split (for example when placing braces around an entire
script). Therefore, UTF-8 encoding is implemented manually.
---
 bin/sh/main.c                                 |  4 ++--
 bin/sh/parser.c                               | 23 +++++++++++++++++++
 bin/sh/sh.1                                   |  4 ++--
 bin/sh/var.c                                  |  7 ++++++
 bin/sh/var.h                                  |  3 +++
 .../regression/bin/sh/parser/dollar-quote10.0 | 10 ++++++++
 .../regression/bin/sh/parser/dollar-quote11.0 |  8 +++++++
 7 files changed, 55 insertions(+), 4 deletions(-)
 create mode 100644 tools/regression/bin/sh/parser/dollar-quote10.0
 create mode 100644 tools/regression/bin/sh/parser/dollar-quote11.0

diff --git a/bin/sh/main.c b/bin/sh/main.c
index d3250eb0eaea..d9629204647b 100644
--- a/bin/sh/main.c
+++ b/bin/sh/main.c
@@ -76,7 +76,7 @@ __FBSDID("$FreeBSD$");
 int rootpid;
 int rootshell;
 struct jmploc main_handler;
-int localeisutf8;
+int localeisutf8, initial_localeisutf8;
 
 static void read_profile(const char *);
 static char *find_dot_file(char *);
@@ -97,7 +97,7 @@ main(int argc, char *argv[])
 	char *shinit;
 
 	(void) setlocale(LC_ALL, "");
-	updatecharset();
+	initcharset();
 	state = 0;
 	if (setjmp(main_handler.loc)) {
 		switch (exception) {
diff --git a/bin/sh/parser.c b/bin/sh/parser.c
index 5133d67f9bfa..cc1860df22fd 100644
--- a/bin/sh/parser.c
+++ b/bin/sh/parser.c
@@ -1219,6 +1219,29 @@ readcstyleesc(char *out)
 		  if (v == 0 || (v >= 0xd800 && v <= 0xdfff))
 			  synerror("Bad escape sequence");
 		  /* We really need iconv here. */
+		  if (initial_localeisutf8 && v > 127) {
+			  CHECKSTRSPACE(4, out);
+			  /*
+			   * We cannot use wctomb() as the locale may have
+			   * changed.
+			   */
+			  if (v <= 0x7ff) {
+				  USTPUTC(0xc0 | v >> 6, out);
+				  USTPUTC(0x80 | (v & 0x3f), out);
+				  return out;
+			  } else if (v <= 0xffff) {
+				  USTPUTC(0xe0 | v >> 12, out);
+				  USTPUTC(0x80 | ((v >> 6) & 0x3f), out);
+				  USTPUTC(0x80 | (v & 0x3f), out);
+				  return out;
+			  } else if (v <= 0x10ffff) {
+				  USTPUTC(0xf0 | v >> 18, out);
+				  USTPUTC(0x80 | ((v >> 12) & 0x3f), out);
+				  USTPUTC(0x80 | ((v >> 6) & 0x3f), out);
+				  USTPUTC(0x80 | (v & 0x3f), out);
+				  return out;
+			  }
+		  }
 		  if (v > 127)
 			  v = '?';
 		  break;
diff --git a/bin/sh/sh.1 b/bin/sh/sh.1
index 5710060ca7f8..838cafeccf79 100644
--- a/bin/sh/sh.1
+++ b/bin/sh/sh.1
@@ -463,8 +463,8 @@ The Unicode code point
 (eight hexadecimal digits)
 .El
 .Pp
-The sequences for Unicode code points currently only provide useful results
-for values below 128.
+The sequences for Unicode code points above 127 are currently only useful
+if the shell was started in a UTF-8 locale.
 They reject code point 0 and UTF-16 surrogates.
 .Pp
 If an escape sequence would produce a byte with value 0,
diff --git a/bin/sh/var.c b/bin/sh/var.c
index 33890db1ad2a..84b6dea66d4c 100644
--- a/bin/sh/var.c
+++ b/bin/sh/var.c
@@ -517,6 +517,13 @@ updatecharset(void)
 	localeisutf8 = !strcmp(charset, "UTF-8");
 }
 
+void
+initcharset(void)	/* called from main() right after setlocale() */
+{
+	updatecharset();	/* recomputes localeisutf8 */
+	initial_localeisutf8 = localeisutf8;	/* snapshot; the parser reads this */
+}
+
 /*
  * Generate a list of exported variables.  This routine is used to construct
  * the third argument to execve when executing a program.
diff --git a/bin/sh/var.h b/bin/sh/var.h
index 323ddc2308d1..ff21c7da41fd 100644
--- a/bin/sh/var.h
+++ b/bin/sh/var.h
@@ -83,6 +83,8 @@ extern struct var vterm;
 #endif
 
 extern int localeisutf8;
+/* The parser uses the locale that was in effect at startup. */
+extern int initial_localeisutf8;
 
 /*
  * The following macros access the values of the above variables.
@@ -116,6 +118,7 @@ char *bltinlookup(const char *, int);
 void bltinsetlocale(void);
 void bltinunsetlocale(void);
 void updatecharset(void);
+void initcharset(void);
 char **environment(void);
 int showvarscmd(int, char **);
 int exportcmd(int, char **);
diff --git a/tools/regression/bin/sh/parser/dollar-quote10.0 b/tools/regression/bin/sh/parser/dollar-quote10.0
new file mode 100644
index 000000000000..ad166da23ffe
--- /dev/null
+++ b/tools/regression/bin/sh/parser/dollar-quote10.0
@@ -0,0 +1,10 @@
+# $FreeBSD$
+
+# Expected UTF-8 bytes: U+00E4 (a umlaut) followed by U+20AC (euro sign).
+expected=$(printf '\303\244\342\202\254')
+
+# The \u escapes are evaluated by a freshly started shell so that the
+# UTF-8 locale set through LC_ALL is the one it sees at startup.
+actual=$(LC_ALL=en_US.UTF-8 ${SH} -c "printf %s \$'\u00e4\u20ac'")
+
+[ "$expected" = "$actual" ]
diff --git a/tools/regression/bin/sh/parser/dollar-quote11.0 b/tools/regression/bin/sh/parser/dollar-quote11.0
new file mode 100644
index 000000000000..2e872abfe5b0
--- /dev/null
+++ b/tools/regression/bin/sh/parser/dollar-quote11.0
@@ -0,0 +1,8 @@
+# $FreeBSD$
+
+# U+1D565 MATHEMATICAL DOUBLE-STRUCK SMALL T, a code point outside the BMP.
+s=$(printf '\360\235\225\245')
+
+# Start a new shell so the locale change is picked up.
+ss="$(LC_ALL=en_US.UTF-8 ${SH} -c "printf %s \$'\U0001d565'")"
+[ "$s" = "$ss" ]