sh: Add UTF-8 support to pattern matching.
The ? and [...] patterns now match codepoints instead of bytes, and they do not match invalid sequences. A [...] pattern must not itself contain invalid sequences; otherwise it will not match anything. This is so that ${var#?} removes the first codepoint, not the first byte, without putting UTF-8 knowledge into the ${var#pattern} code. However, * continues to match any string, and an invalid sequence in a pattern still matches an identical invalid sequence in the string. (This differs from fnmatch(3).)
This commit is contained in:
parent
ab0ffb4c88
commit
7cc6b3df80
Notes:
svn2git
2020-12-20 02:59:44 +00:00
svn path=/head/; revision=221646
@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
|
|||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
#include <wchar.h>
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Routines to expand arguments to commands. We have to deal with
|
* Routines to expand arguments to commands. We have to deal with
|
||||||
@ -111,16 +112,16 @@ static void addfname(char *);
|
|||||||
static struct strlist *expsort(struct strlist *);
|
static struct strlist *expsort(struct strlist *);
|
||||||
static struct strlist *msort(struct strlist *, int);
|
static struct strlist *msort(struct strlist *, int);
|
||||||
static char *cvtnum(int, char *);
|
static char *cvtnum(int, char *);
|
||||||
static int collate_range_cmp(int, int);
|
static int collate_range_cmp(wchar_t, wchar_t);
|
||||||
|
|
||||||
static int
|
static int
|
||||||
collate_range_cmp(int c1, int c2)
|
collate_range_cmp(wchar_t c1, wchar_t c2)
|
||||||
{
|
{
|
||||||
static char s1[2], s2[2];
|
static wchar_t s1[2], s2[2];
|
||||||
|
|
||||||
s1[0] = c1;
|
s1[0] = c1;
|
||||||
s2[0] = c2;
|
s2[0] = c2;
|
||||||
return (strcoll(s1, s2));
|
return (wcscoll(s1, s2));
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -1377,6 +1378,23 @@ msort(struct strlist *list, int len)
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Decode one multibyte character from the string that *p points to.
 *
 * On success, *p is advanced past the decoded character and the wide
 * character is returned.
 *
 * Returns 0 and leaves *p unchanged in two cases: *p points at the
 * terminating NUL (mbtowc() returned 0), or mbtowc() rejected the bytes
 * (returned -1).  The return value alone does not distinguish the two.
 */
static wchar_t
|
||||||
|
get_wc(const char **p)
|
||||||
|
{
|
||||||
|
wchar_t c;
|
||||||
|
int chrlen;
|
||||||
|
|
||||||
|
/* Examine at most 4 bytes; presumably the longest UTF-8 sequence -- confirm. */
chrlen = mbtowc(&c, *p, 4);
|
||||||
|
if (chrlen == 0)
|
||||||
|
return 0;
|
||||||
|
else if (chrlen == -1)
|
||||||
|
/* Invalid sequence: report it as 0 and do not consume any input. */
c = 0;
|
||||||
|
else
|
||||||
|
*p += chrlen;
|
||||||
|
return c;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Returns true if the pattern matches the string.
|
* Returns true if the pattern matches the string.
|
||||||
*/
|
*/
|
||||||
@ -1386,6 +1404,7 @@ patmatch(const char *pattern, const char *string, int squoted)
|
|||||||
{
|
{
|
||||||
const char *p, *q;
|
const char *p, *q;
|
||||||
char c;
|
char c;
|
||||||
|
wchar_t wc, wc2;
|
||||||
|
|
||||||
p = pattern;
|
p = pattern;
|
||||||
q = string;
|
q = string;
|
||||||
@ -1404,7 +1423,11 @@ patmatch(const char *pattern, const char *string, int squoted)
|
|||||||
case '?':
|
case '?':
|
||||||
if (squoted && *q == CTLESC)
|
if (squoted && *q == CTLESC)
|
||||||
q++;
|
q++;
|
||||||
if (*q++ == '\0')
|
if (localeisutf8)
|
||||||
|
wc = get_wc(&q);
|
||||||
|
else
|
||||||
|
wc = *q++;
|
||||||
|
if (wc == '\0')
|
||||||
return 0;
|
return 0;
|
||||||
break;
|
break;
|
||||||
case '*':
|
case '*':
|
||||||
@ -1434,7 +1457,7 @@ patmatch(const char *pattern, const char *string, int squoted)
|
|||||||
case '[': {
|
case '[': {
|
||||||
const char *endp;
|
const char *endp;
|
||||||
int invert, found;
|
int invert, found;
|
||||||
char chr;
|
wchar_t chr;
|
||||||
|
|
||||||
endp = p;
|
endp = p;
|
||||||
if (*endp == '!' || *endp == '^')
|
if (*endp == '!' || *endp == '^')
|
||||||
@ -1455,8 +1478,11 @@ patmatch(const char *pattern, const char *string, int squoted)
|
|||||||
p++;
|
p++;
|
||||||
}
|
}
|
||||||
found = 0;
|
found = 0;
|
||||||
chr = *q++;
|
if (squoted && *q == CTLESC)
|
||||||
if (squoted && chr == CTLESC)
|
q++;
|
||||||
|
if (localeisutf8)
|
||||||
|
chr = get_wc(&q);
|
||||||
|
else
|
||||||
chr = *q++;
|
chr = *q++;
|
||||||
if (chr == '\0')
|
if (chr == '\0')
|
||||||
return 0;
|
return 0;
|
||||||
@ -1466,19 +1492,31 @@ patmatch(const char *pattern, const char *string, int squoted)
|
|||||||
continue;
|
continue;
|
||||||
if (c == CTLESC)
|
if (c == CTLESC)
|
||||||
c = *p++;
|
c = *p++;
|
||||||
|
if (localeisutf8 && c & 0x80) {
|
||||||
|
p--;
|
||||||
|
wc = get_wc(&p);
|
||||||
|
if (wc == 0) /* bad utf-8 */
|
||||||
|
return 0;
|
||||||
|
} else
|
||||||
|
wc = c;
|
||||||
if (*p == '-' && p[1] != ']') {
|
if (*p == '-' && p[1] != ']') {
|
||||||
p++;
|
p++;
|
||||||
while (*p == CTLQUOTEMARK)
|
while (*p == CTLQUOTEMARK)
|
||||||
p++;
|
p++;
|
||||||
if (*p == CTLESC)
|
if (*p == CTLESC)
|
||||||
p++;
|
p++;
|
||||||
if ( collate_range_cmp(chr, c) >= 0
|
if (localeisutf8) {
|
||||||
&& collate_range_cmp(chr, *p) <= 0
|
wc2 = get_wc(&p);
|
||||||
|
if (wc2 == 0) /* bad utf-8 */
|
||||||
|
return 0;
|
||||||
|
} else
|
||||||
|
wc2 = *p++;
|
||||||
|
if ( collate_range_cmp(chr, wc) >= 0
|
||||||
|
&& collate_range_cmp(chr, wc2) <= 0
|
||||||
)
|
)
|
||||||
found = 1;
|
found = 1;
|
||||||
p++;
|
|
||||||
} else {
|
} else {
|
||||||
if (chr == c)
|
if (chr == wc)
|
||||||
found = 1;
|
found = 1;
|
||||||
}
|
}
|
||||||
} while ((c = *p++) != ']');
|
} while ((c = *p++) != ']');
|
||||||
|
57
tools/regression/bin/sh/builtins/case5.0
Normal file
57
tools/regression/bin/sh/builtins/case5.0
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
# $FreeBSD$
|
||||||
|
|
||||||
|
unset LC_ALL
|
||||||
|
LC_CTYPE=en_US.UTF-8
|
||||||
|
export LC_CTYPE
|
||||||
|
|
||||||
|
c1=e
|
||||||
|
# a umlaut
|
||||||
|
c2=$(printf '\303\244')
|
||||||
|
# euro sign
|
||||||
|
c3=$(printf '\342\202\254')
|
||||||
|
# some sort of 't' outside BMP
|
||||||
|
c4=$(printf '\360\235\225\245')
|
||||||
|
|
||||||
|
ok=0
|
||||||
|
case $c1$c2$c3$c4 in
|
||||||
|
*) ok=1 ;;
|
||||||
|
esac
|
||||||
|
if [ $ok = 0 ]; then
|
||||||
|
echo wrong at $LINENO
|
||||||
|
exit 3
|
||||||
|
fi
|
||||||
|
|
||||||
|
case $c1$c2$c3$c4 in
|
||||||
|
$c1$c2$c3$c4) ;;
|
||||||
|
*) echo wrong at $LINENO ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
case $c1$c2$c3$c4 in
|
||||||
|
"$c1$c2$c3$c4") ;;
|
||||||
|
*) echo wrong at $LINENO ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
case $c1$c2$c3$c4 in
|
||||||
|
????) ;;
|
||||||
|
*) echo wrong at $LINENO ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
case $c1.$c2.$c3.$c4 in
|
||||||
|
?.?.?.?) ;;
|
||||||
|
*) echo wrong at $LINENO ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
case $c1$c2$c3$c4 in
|
||||||
|
[!a][!b][!c][!d]) ;;
|
||||||
|
*) echo wrong at $LINENO ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
case $c1$c2$c3$c4 in
|
||||||
|
[$c1][$c2][$c3][$c4]) ;;
|
||||||
|
*) echo wrong at $LINENO ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
case $c1$c2$c3$c4 in
|
||||||
|
["$c1"]["$c2"]["$c3"]["$c4"]) ;;
|
||||||
|
*) echo wrong at $LINENO ;;
|
||||||
|
esac
|
75
tools/regression/bin/sh/expansion/trim8.0
Normal file
75
tools/regression/bin/sh/expansion/trim8.0
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
# $FreeBSD$
|
||||||
|
|
||||||
|
unset LC_ALL
|
||||||
|
LC_CTYPE=en_US.UTF-8
|
||||||
|
export LC_CTYPE
|
||||||
|
|
||||||
|
c1=e
|
||||||
|
# a umlaut
|
||||||
|
c2=$(printf '\303\244')
|
||||||
|
# euro sign
|
||||||
|
c3=$(printf '\342\202\254')
|
||||||
|
# some sort of 't' outside BMP
|
||||||
|
c4=$(printf '\360\235\225\245')
|
||||||
|
|
||||||
|
s=$c1$c2$c3$c4
|
||||||
|
|
||||||
|
# testcase CODE EXPECTED
#
# Evaluates CODE, then summarises the resulting positional parameters as
# "<count>|<param1>|<param2>..." and compares that string with EXPECTED.
# On a match, one "x" is prepended to $ok; on a mismatch, one "x" is
# prepended to $failures and a diagnostic line is printed.
testcase() {
|
||||||
|
code="$1"
|
||||||
|
expected="$2"
|
||||||
|
oIFS="$IFS"
|
||||||
|
eval "$code"
|
||||||
|
# "$*" joins the positional parameters with the first character of IFS.
IFS='|'
|
||||||
|
result="$#|$*"
|
||||||
|
IFS="$oIFS"
|
||||||
|
# The leading "x" keeps test(1) from misreading an empty or operator-like operand.
if [ "x$result" = "x$expected" ]; then
|
||||||
|
ok=x$ok
|
||||||
|
else
|
||||||
|
failures=x$failures
|
||||||
|
echo "For $code, expected $expected actual $result"
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
testcase 'set -- "$s"' "1|$s"
|
||||||
|
testcase 'set -- "${s#$c2}"' "1|$s"
|
||||||
|
testcase 'set -- "${s#*}"' "1|$s"
|
||||||
|
testcase 'set -- "${s#$c1}"' "1|$c2$c3$c4"
|
||||||
|
testcase 'set -- "${s#$c1$c2}"' "1|$c3$c4"
|
||||||
|
testcase 'set -- "${s#$c1$c2$c3}"' "1|$c4"
|
||||||
|
testcase 'set -- "${s#$c1$c2$c3$c4}"' "1|"
|
||||||
|
testcase 'set -- "${s#?}"' "1|$c2$c3$c4"
|
||||||
|
testcase 'set -- "${s#??}"' "1|$c3$c4"
|
||||||
|
testcase 'set -- "${s#???}"' "1|$c4"
|
||||||
|
testcase 'set -- "${s#????}"' "1|"
|
||||||
|
testcase 'set -- "${s#*$c3}"' "1|$c4"
|
||||||
|
testcase 'set -- "${s%$c4}"' "1|$c1$c2$c3"
|
||||||
|
testcase 'set -- "${s%$c3$c4}"' "1|$c1$c2"
|
||||||
|
testcase 'set -- "${s%$c2$c3$c4}"' "1|$c1"
|
||||||
|
testcase 'set -- "${s%$c1$c2$c3$c4}"' "1|"
|
||||||
|
testcase 'set -- "${s%?}"' "1|$c1$c2$c3"
|
||||||
|
testcase 'set -- "${s%??}"' "1|$c1$c2"
|
||||||
|
testcase 'set -- "${s%???}"' "1|$c1"
|
||||||
|
testcase 'set -- "${s%????}"' "1|"
|
||||||
|
testcase 'set -- "${s%$c2*}"' "1|$c1"
|
||||||
|
testcase 'set -- "${s##$c2}"' "1|$s"
|
||||||
|
testcase 'set -- "${s##*}"' "1|"
|
||||||
|
testcase 'set -- "${s##$c1}"' "1|$c2$c3$c4"
|
||||||
|
testcase 'set -- "${s##$c1$c2}"' "1|$c3$c4"
|
||||||
|
testcase 'set -- "${s##$c1$c2$c3}"' "1|$c4"
|
||||||
|
testcase 'set -- "${s##$c1$c2$c3$c4}"' "1|"
|
||||||
|
testcase 'set -- "${s##?}"' "1|$c2$c3$c4"
|
||||||
|
testcase 'set -- "${s##??}"' "1|$c3$c4"
|
||||||
|
testcase 'set -- "${s##???}"' "1|$c4"
|
||||||
|
testcase 'set -- "${s##????}"' "1|"
|
||||||
|
testcase 'set -- "${s##*$c3}"' "1|$c4"
|
||||||
|
testcase 'set -- "${s%%$c4}"' "1|$c1$c2$c3"
|
||||||
|
testcase 'set -- "${s%%$c3$c4}"' "1|$c1$c2"
|
||||||
|
testcase 'set -- "${s%%$c2$c3$c4}"' "1|$c1"
|
||||||
|
testcase 'set -- "${s%%$c1$c2$c3$c4}"' "1|"
|
||||||
|
testcase 'set -- "${s%%?}"' "1|$c1$c2$c3"
|
||||||
|
testcase 'set -- "${s%%??}"' "1|$c1$c2"
|
||||||
|
testcase 'set -- "${s%%???}"' "1|$c1"
|
||||||
|
testcase 'set -- "${s%%????}"' "1|"
|
||||||
|
testcase 'set -- "${s%%$c2*}"' "1|$c1"
|
||||||
|
|
||||||
|
test "x$failures" = x
|
Loading…
Reference in New Issue
Block a user