sh: Add UTF-8 support to pattern matching.

The ? and [...] patterns match codepoints instead of bytes. They do not match invalid sequences. [...] patterns must not contain invalid sequences; otherwise they will not match anything. This is so that ${var#?} removes the first codepoint, not the first byte, without putting UTF-8 knowledge into the ${var#pattern} code. However, * continues to match any string, and an invalid sequence matches an identical invalid sequence. (This differs from fnmatch(3).)
svn path=/head/; revision=221646
2011-05-08 11:32:20 +00:00 · 2011-05-08 11:32:20 +00:00 · 7cc6b3df80 · 2020-12-20 02:59:44 +00:00
commit 7cc6b3df80
parent ab0ffb4c88
3 changed files with 182 additions and 12 deletions
--- a/bin/sh/expand.c
+++ b/bin/sh/expand.c
@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <wchar.h>
 /*
 * Routines to expand arguments to commands.  We have to deal with
@ -111,16 +112,16 @@ static void addfname(char *);
 static struct strlist *expsort(struct strlist *);
 static struct strlist *msort(struct strlist *, int);
 static char *cvtnum(int, char *);
-static int collate_range_cmp(int, int);
+static int collate_range_cmp(wchar_t, wchar_t);
 /*
  * Compare two wide characters in the current locale's collation order.
  * Each character is placed in a one-character wide string and the pair
  * is handed to wcscoll(); that function's result is returned unchanged
  * (negative, zero or positive).
  */
 static int
-collate_range_cmp(int c1, int c2)
+collate_range_cmp(wchar_t c1, wchar_t c2)
 {
-	static char s1[2], s2[2];
+	static wchar_t s1[2], s2[2];
 	/* Only element 0 is ever written; static storage keeps element 1 zero. */
 	s1[0] = c1;
 	s2[0] = c2;
-	return (strcoll(s1, s2));
+	return (wcscoll(s1, s2));
 }
 /*
@ -1377,6 +1378,23 @@ msort(struct strlist *list, int len)
 /*
  * Decode the multibyte character at *p with mbtowc(), examining no more
  * than 4 bytes.  On success *p is advanced past the character and its
  * wide value is returned.  At the terminating NUL, or when mbtowc()
  * rejects the sequence, 0 is returned and *p is left where it was.
  */
 static wchar_t
 get_wc(const char **p)
 {
 	wchar_t decoded;
 	int nbytes;
 	nbytes = mbtowc(&decoded, *p, 4);
 	/* 0 means NUL, -1 means an invalid sequence: neither consumes input. */
 	if (nbytes <= 0)
 		return 0;
 	*p += nbytes;
 	return decoded;
 }
 /*
 * Returns true if the pattern matches the string.
 */
@ -1386,6 +1404,7 @@ patmatch(const char *pattern, const char *string, int squoted)
 {
 	const char *p, *q;
 	char c;
 	wchar_t wc, wc2;
 	p = pattern;
 	q = string;
@ -1404,7 +1423,11 @@ patmatch(const char *pattern, const char *string, int squoted)
 		case '?':
 			if (squoted && *q == CTLESC)
 				q++;
-			if (*q++ == '\0')
+			if (localeisutf8)
 				wc = get_wc(&q);
 			else
 				wc = *q++;
 			if (wc == '\0')
 				return 0;
 			break;
 		case '*':
@ -1434,7 +1457,7 @@ patmatch(const char *pattern, const char *string, int squoted)
 		case '[': {
 			const char *endp;
 			int invert, found;
-			char chr;
+			wchar_t chr;
 			endp = p;
 			if (*endp == '!' || *endp == '^')
@ -1455,8 +1478,11 @@ patmatch(const char *pattern, const char *string, int squoted)
 				p++;
 			}
 			found = 0;
-			chr = *q++;
+			if (squoted && *q == CTLESC)
-			if (squoted && chr == CTLESC)
+				q++;
 			if (localeisutf8)
 				chr = get_wc(&q);
 			else
 				chr = *q++;
 			if (chr == '\0')
 				return 0;
@ -1466,19 +1492,31 @@ patmatch(const char *pattern, const char *string, int squoted)
 					continue;
 				if (c == CTLESC)
 					c = *p++;
 				if (localeisutf8 && c & 0x80) {
 					p--;
 					wc = get_wc(&p);
 					if (wc == 0) /* bad utf-8 */
 						return 0;
 				} else
 					wc = c;
 				if (*p == '-' && p[1] != ']') {
 					p++;
 					while (*p == CTLQUOTEMARK)
 						p++;
 					if (*p == CTLESC)
 						p++;
-					if (   collate_range_cmp(chr, c) >= 0
+					if (localeisutf8) {
-					    && collate_range_cmp(chr, *p) <= 0
+						wc2 = get_wc(&p);
 						if (wc2 == 0) /* bad utf-8 */
 							return 0;
 					} else
 						wc2 = *p++;
 					if (   collate_range_cmp(chr, wc) >= 0
 					    && collate_range_cmp(chr, wc2) <= 0
 					   )
 						found = 1;
 					p++;
 				} else {
-					if (chr == c)
+					if (chr == wc)
 						found = 1;
 				}
 			} while ((c = *p++) != ']');
--- a/tools/regression/bin/sh/builtins/case5.0
+++ b/tools/regression/bin/sh/builtins/case5.0
@ -0,0 +1,57 @@
 # $FreeBSD$
 unset LC_ALL	# LC_ALL would otherwise take precedence over LC_CTYPE
 LC_CTYPE=en_US.UTF-8
 export LC_CTYPE
 c1=e	# plain ASCII letter, 1 byte
 # U+00E4, a umlaut (2 bytes in UTF-8)
 c2=$(printf '\303\244')
 # U+20AC, euro sign (3 bytes in UTF-8)
 c3=$(printf '\342\202\254')
 # U+1D565, some sort of 't' outside BMP (4 bytes in UTF-8)
 c4=$(printf '\360\235\225\245')
 ok=0
 case $c1$c2$c3$c4 in
 *) ok=1 ;;	# * must match the whole multibyte string
 esac
 if [ $ok = 0 ]; then
 	echo wrong at $LINENO
 	exit 3
 fi
 case $c1$c2$c3$c4 in
 $c1$c2$c3$c4) ;;	# unquoted literal pattern equal to the subject
 *) echo wrong at $LINENO ;;
 esac
 case $c1$c2$c3$c4 in
 "$c1$c2$c3$c4") ;;	# same pattern, quoted
 *) echo wrong at $LINENO ;;
 esac
 case $c1$c2$c3$c4 in
 ????) ;;	# four characters, so exactly four ? (the string is 10 bytes)
 *) echo wrong at $LINENO ;;
 esac
 case $c1.$c2.$c3.$c4 in
 ?.?.?.?) ;;	# each ? sits between literal dots, one character apiece
 *) echo wrong at $LINENO ;;
 esac
 case $c1$c2$c3$c4 in
 [!a][!b][!c][!d]) ;;	# negated brackets: one character each, none of them a-d
 *) echo wrong at $LINENO ;;
 esac
 case $c1$c2$c3$c4 in
 [$c1][$c2][$c3][$c4]) ;;	# multibyte characters listed inside brackets
 *) echo wrong at $LINENO ;;
 esac
 case $c1$c2$c3$c4 in
 ["$c1"]["$c2"]["$c3"]["$c4"]) ;;	# same, with the bracket contents quoted
 *) echo wrong at $LINENO ;;
 esac
--- a/tools/regression/bin/sh/expansion/trim8.0
+++ b/tools/regression/bin/sh/expansion/trim8.0
@ -0,0 +1,75 @@
 # $FreeBSD$
 # Checks the ${var#pat}, ${var##pat}, ${var%pat} and ${var%%pat} trimming
 # expansions on a string of four characters whose UTF-8 encodings are
 # 1, 2, 3 and 4 bytes long.
 unset LC_ALL
 LC_CTYPE=en_US.UTF-8
 export LC_CTYPE
 c1=e
 # a umlaut
 c2=$(printf '\303\244')
 # euro sign
 c3=$(printf '\342\202\254')
 # some sort of 't' outside BMP
 c4=$(printf '\360\235\225\245')
 s=$c1$c2$c3$c4
 # testcase CODE EXPECTED
 # Evaluates CODE (each caller passes a "set -- ..." command, which replaces
 # the positional parameters), then builds "<count>|<params joined by |>"
 # and compares it with EXPECTED.  A pass appends an x to $ok; a failure
 # appends an x to $failures and prints a diagnostic.
 testcase() {
 	code="$1"
 	expected="$2"
 	oIFS="$IFS"
 	eval "$code"
 	# "$*" joins the parameters with the first character of IFS.
 	IFS='|'
 	result="$#|$*"
 	IFS="$oIFS"
 	if [ "x$result" = "x$expected" ]; then
 		ok=x$ok
 	else
 		failures=x$failures
 		echo "For $code, expected $expected actual $result"
 	fi
 }
 testcase 'set -- "$s"'				"1|$s"
 # ${s#pat}: prefix trimming, shortest match.
 testcase 'set -- "${s#$c2}"'			"1|$s"
 testcase 'set -- "${s#*}"'			"1|$s"
 testcase 'set -- "${s#$c1}"'			"1|$c2$c3$c4"
 testcase 'set -- "${s#$c1$c2}"'			"1|$c3$c4"
 testcase 'set -- "${s#$c1$c2$c3}"'		"1|$c4"
 testcase 'set -- "${s#$c1$c2$c3$c4}"'		"1|"
 testcase 'set -- "${s#?}"'			"1|$c2$c3$c4"
 testcase 'set -- "${s#??}"'			"1|$c3$c4"
 testcase 'set -- "${s#???}"'			"1|$c4"
 testcase 'set -- "${s#????}"'			"1|"
 testcase 'set -- "${s#*$c3}"'			"1|$c4"
 # ${s%pat}: suffix trimming, shortest match.
 testcase 'set -- "${s%$c4}"'			"1|$c1$c2$c3"
 testcase 'set -- "${s%$c3$c4}"'			"1|$c1$c2"
 testcase 'set -- "${s%$c2$c3$c4}"'		"1|$c1"
 testcase 'set -- "${s%$c1$c2$c3$c4}"'		"1|"
 testcase 'set -- "${s%?}"'			"1|$c1$c2$c3"
 testcase 'set -- "${s%??}"'			"1|$c1$c2"
 testcase 'set -- "${s%???}"'			"1|$c1"
 testcase 'set -- "${s%????}"'			"1|"
 testcase 'set -- "${s%$c2*}"'			"1|$c1"
 # ${s##pat}: prefix trimming, longest match.
 testcase 'set -- "${s##$c2}"'			"1|$s"
 testcase 'set -- "${s##*}"'			"1|"
 testcase 'set -- "${s##$c1}"'			"1|$c2$c3$c4"
 testcase 'set -- "${s##$c1$c2}"'		"1|$c3$c4"
 testcase 'set -- "${s##$c1$c2$c3}"'		"1|$c4"
 testcase 'set -- "${s##$c1$c2$c3$c4}"'		"1|"
 testcase 'set -- "${s##?}"'			"1|$c2$c3$c4"
 testcase 'set -- "${s##??}"'			"1|$c3$c4"
 testcase 'set -- "${s##???}"'			"1|$c4"
 testcase 'set -- "${s##????}"'			"1|"
 testcase 'set -- "${s##*$c3}"'			"1|$c4"
 # ${s%%pat}: suffix trimming, longest match.
 testcase 'set -- "${s%%$c4}"'			"1|$c1$c2$c3"
 testcase 'set -- "${s%%$c3$c4}"'		"1|$c1$c2"
 testcase 'set -- "${s%%$c2$c3$c4}"'		"1|$c1"
 testcase 'set -- "${s%%$c1$c2$c3$c4}"'		"1|"
 testcase 'set -- "${s%%?}"'			"1|$c1$c2$c3"
 testcase 'set -- "${s%%??}"'			"1|$c1$c2"
 testcase 'set -- "${s%%???}"'			"1|$c1"
 testcase 'set -- "${s%%????}"'			"1|"
 testcase 'set -- "${s%%$c2*}"'			"1|$c1"
 # The script's exit status: success only if no testcase recorded a failure.
 test "x$failures" = x