From 367ed4e13d697ceb415183d8a7acddf5f707667c Mon Sep 17 00:00:00 2001 From: "Andrey A. Chernov" Date: Sat, 13 Oct 2007 16:28:22 +0000 Subject: [PATCH] The problem is: currently our single byte ctype(3) functions are broken for wide character locales in the argument range >= 0x80 - they may return false positives. Example 1: for the UTF-8 locale we currently have: iswspace(0xA0)==1 and isspace(0xA0)==1 (because iswspace() and isspace() are the same code) but must have iswspace(0xA0)==1 and isspace(0xA0)==0 (because there is no such single byte character, nor any other in the range 0x80..0xff, for the UTF-8 locale; it keeps only ASCII in the single byte range because our internal wchar_t representation for UTF-8 is UCS-4). Example 2: for all wide character locales isalpha(arg) when arg > 0xFF may return false positives (must be 0), because iswalpha() and isalpha() are the same code. This change addresses this issue by separating single byte and wide ctype, and also fixes iswascii() (currently iswascii() is broken for arguments > 0xFF). This change is 100% binary compatible with old binaries. 
Reviewed by: i18n@ --- include/_ctype.h | 35 ++++++++++++++++++++++++++- include/ctype.h | 42 ++++++++++++++++----------------- include/wctype.h | 2 +- lib/libc/locale/Symbol.map | 5 ++++ lib/libc/locale/big5.c | 3 +++ lib/libc/locale/euc.c | 3 +++ lib/libc/locale/gb18030.c | 3 +++ lib/libc/locale/gb2312.c | 3 +++ lib/libc/locale/gbk.c | 3 +++ lib/libc/locale/isctype.c | 38 ++++++++++++++--------------- lib/libc/locale/iswctype.c | 2 +- lib/libc/locale/mskanji.c | 3 +++ lib/libc/locale/none.c | 7 +++++- lib/libc/locale/setrunelocale.c | 5 ++++ lib/libc/locale/utf8.c | 3 +++ 15 files changed, 113 insertions(+), 44 deletions(-) diff --git a/include/_ctype.h b/include/_ctype.h index 1a42952a09dc..e27ab24f6c19 100644 --- a/include/_ctype.h +++ b/include/_ctype.h @@ -87,6 +87,8 @@ __END_DECLS #define __inline #endif +extern int __mb_sb_limit; + /* * Use inline functions if we are allowed to and the compiler supports them. */ @@ -102,16 +104,29 @@ __maskrune(__ct_rune_t _c, unsigned long _f) _CurrentRuneLocale->__runetype[_c]) & _f; } +static __inline int +__sbmaskrune(__ct_rune_t _c, unsigned long _f) +{ + return (_c < 0 || _c >= __mb_sb_limit) ? 0 : + _CurrentRuneLocale->__runetype[_c] & _f; +} + static __inline int __istype(__ct_rune_t _c, unsigned long _f) { return (!!__maskrune(_c, _f)); } +static __inline int +__sbistype(__ct_rune_t _c, unsigned long _f) +{ + return (!!__sbmaskrune(_c, _f)); +} + static __inline int __isctype(__ct_rune_t _c, unsigned long _f) { - return (_c < 0 || _c >= _CACHED_RUNES) ? 0 : + return (_c < 0 || _c >= __mb_sb_limit) ? 0 : !!(_DefaultRuneLocale.__runetype[_c] & _f); } @@ -122,6 +137,13 @@ __toupper(__ct_rune_t _c) _CurrentRuneLocale->__mapupper[_c]; } +static __inline __ct_rune_t +__sbtoupper(__ct_rune_t _c) +{ + return (_c < 0 || _c >= __mb_sb_limit) ? 
_c : + _CurrentRuneLocale->__mapupper[_c]; +} + static __inline __ct_rune_t __tolower(__ct_rune_t _c) { @@ -129,6 +151,13 @@ __tolower(__ct_rune_t _c) _CurrentRuneLocale->__maplower[_c]; } +static __inline __ct_rune_t +__sbtolower(__ct_rune_t _c) +{ + return (_c < 0 || _c >= __mb_sb_limit) ? _c : + _CurrentRuneLocale->__maplower[_c]; +} + static __inline int __wcwidth(__ct_rune_t _c) { @@ -146,10 +175,14 @@ __wcwidth(__ct_rune_t _c) __BEGIN_DECLS int __maskrune(__ct_rune_t, unsigned long); +int __sbmaskrune(__ct_rune_t, unsigned long); int __istype(__ct_rune_t, unsigned long); +int __sbistype(__ct_rune_t, unsigned long); int __isctype(__ct_rune_t, unsigned long); __ct_rune_t __toupper(__ct_rune_t); +__ct_rune_t __sbtoupper(__ct_rune_t); __ct_rune_t __tolower(__ct_rune_t); +__ct_rune_t __sbtolower(__ct_rune_t); int __wcwidth(__ct_rune_t); __END_DECLS #endif /* using inlines */ diff --git a/include/ctype.h b/include/ctype.h index 0825ff652c28..dfc89ccd5951 100644 --- a/include/ctype.h +++ b/include/ctype.h @@ -86,19 +86,19 @@ int isspecial(int); #endif __END_DECLS -#define isalnum(c) __istype((c), _CTYPE_A|_CTYPE_D) -#define isalpha(c) __istype((c), _CTYPE_A) -#define iscntrl(c) __istype((c), _CTYPE_C) +#define isalnum(c) __sbistype((c), _CTYPE_A|_CTYPE_D) +#define isalpha(c) __sbistype((c), _CTYPE_A) +#define iscntrl(c) __sbistype((c), _CTYPE_C) #define isdigit(c) __isctype((c), _CTYPE_D) /* ANSI -- locale independent */ -#define isgraph(c) __istype((c), _CTYPE_G) -#define islower(c) __istype((c), _CTYPE_L) -#define isprint(c) __istype((c), _CTYPE_R) -#define ispunct(c) __istype((c), _CTYPE_P) -#define isspace(c) __istype((c), _CTYPE_S) -#define isupper(c) __istype((c), _CTYPE_U) +#define isgraph(c) __sbistype((c), _CTYPE_G) +#define islower(c) __sbistype((c), _CTYPE_L) +#define isprint(c) __sbistype((c), _CTYPE_R) +#define ispunct(c) __sbistype((c), _CTYPE_P) +#define isspace(c) __sbistype((c), _CTYPE_S) +#define isupper(c) __sbistype((c), _CTYPE_U) #define 
isxdigit(c) __isctype((c), _CTYPE_X) /* ANSI -- locale independent */ -#define tolower(c) __tolower(c) -#define toupper(c) __toupper(c) +#define tolower(c) __sbtolower(c) +#define toupper(c) __sbtoupper(c) #if __XSI_VISIBLE /* @@ -112,24 +112,24 @@ __END_DECLS * * XXX isascii() and toascii() should similarly be undocumented. */ -#define _tolower(c) __tolower(c) -#define _toupper(c) __toupper(c) +#define _tolower(c) __sbtolower(c) +#define _toupper(c) __sbtoupper(c) #define isascii(c) (((c) & ~0x7F) == 0) #define toascii(c) ((c) & 0x7F) #endif #if __ISO_C_VISIBLE >= 1999 -#define isblank(c) __istype((c), _CTYPE_B) +#define isblank(c) __sbistype((c), _CTYPE_B) #endif #if __BSD_VISIBLE -#define digittoint(c) __maskrune((c), 0xFF) -#define ishexnumber(c) __istype((c), _CTYPE_X) -#define isideogram(c) __istype((c), _CTYPE_I) -#define isnumber(c) __istype((c), _CTYPE_D) -#define isphonogram(c) __istype((c), _CTYPE_Q) -#define isrune(c) __istype((c), 0xFFFFFF00L) -#define isspecial(c) __istype((c), _CTYPE_T) +#define digittoint(c) __sbmaskrune((c), 0xFF) +#define ishexnumber(c) __sbistype((c), _CTYPE_X) +#define isideogram(c) __sbistype((c), _CTYPE_I) +#define isnumber(c) __sbistype((c), _CTYPE_D) +#define isphonogram(c) __sbistype((c), _CTYPE_Q) +#define isrune(c) __sbistype((c), 0xFFFFFF00L) +#define isspecial(c) __sbistype((c), _CTYPE_T) #endif #endif /* !_CTYPE_H_ */ diff --git a/include/wctype.h b/include/wctype.h index 098045fa11a5..31f401ff76e6 100644 --- a/include/wctype.h +++ b/include/wctype.h @@ -106,7 +106,7 @@ __END_DECLS #define towupper(wc) __toupper(wc) #if __BSD_VISIBLE -#define iswascii(wc) (((wc) & ~0x7F) == 0) +#define iswascii(wc) ((wc) < 0x80) #define iswhexnumber(wc) __istype((wc), _CTYPE_X) #define iswideogram(wc) __istype((wc), _CTYPE_I) #define iswnumber(wc) __istype((wc), _CTYPE_D) diff --git a/lib/libc/locale/Symbol.map b/lib/libc/locale/Symbol.map index 12daba14d8b9..20d092bb1076 100644 --- a/lib/libc/locale/Symbol.map +++ 
b/lib/libc/locale/Symbol.map @@ -60,12 +60,17 @@ FBSD_1.0 { nextwctype; nl_langinfo; __maskrune; + __sbmaskrune; __istype; + __sbistype; __isctype; __toupper; + __sbtoupper; __tolower; + __sbtolower; __wcwidth; __mb_cur_max; + __mb_sb_limit; rpmatch; ___runetype; setlocale; diff --git a/lib/libc/locale/big5.c b/lib/libc/locale/big5.c index 44b9957b5978..19977d035dc5 100644 --- a/lib/libc/locale/big5.c +++ b/lib/libc/locale/big5.c @@ -49,6 +49,8 @@ __FBSDID("$FreeBSD$"); #include #include "mblocal.h" +extern int __mb_sb_limit; + static size_t _BIG5_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static int _BIG5_mbsinit(const mbstate_t *); @@ -68,6 +70,7 @@ _BIG5_init(_RuneLocale *rl) __mbsinit = _BIG5_mbsinit; _CurrentRuneLocale = rl; __mb_cur_max = 2; + __mb_sb_limit = 128; return (0); } diff --git a/lib/libc/locale/euc.c b/lib/libc/locale/euc.c index b3b35edb5b6c..188073e49e38 100644 --- a/lib/libc/locale/euc.c +++ b/lib/libc/locale/euc.c @@ -49,6 +49,8 @@ __FBSDID("$FreeBSD$"); #include #include "mblocal.h" +extern int __mb_sb_limit; + static size_t _EUC_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static int _EUC_mbsinit(const mbstate_t *); @@ -116,6 +118,7 @@ _EUC_init(_RuneLocale *rl) __mbrtowc = _EUC_mbrtowc; __wcrtomb = _EUC_wcrtomb; __mbsinit = _EUC_mbsinit; + __mb_sb_limit = 256; return (0); } diff --git a/lib/libc/locale/gb18030.c b/lib/libc/locale/gb18030.c index 3e43179d9213..1457d3e7540c 100644 --- a/lib/libc/locale/gb18030.c +++ b/lib/libc/locale/gb18030.c @@ -39,6 +39,8 @@ __FBSDID("$FreeBSD$"); #include #include "mblocal.h" +extern int __mb_sb_limit; + static size_t _GB18030_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static int _GB18030_mbsinit(const mbstate_t *); @@ -59,6 +61,7 @@ _GB18030_init(_RuneLocale *rl) __mbsinit = _GB18030_mbsinit; _CurrentRuneLocale = rl; __mb_cur_max = 4; + __mb_sb_limit = 128; return (0); } 
diff --git a/lib/libc/locale/gb2312.c b/lib/libc/locale/gb2312.c index 232dabae866e..74a7bdc3be79 100644 --- a/lib/libc/locale/gb2312.c +++ b/lib/libc/locale/gb2312.c @@ -35,6 +35,8 @@ __FBSDID("$FreeBSD$"); #include #include "mblocal.h" +extern int __mb_sb_limit; + static size_t _GB2312_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static int _GB2312_mbsinit(const mbstate_t *); @@ -55,6 +57,7 @@ _GB2312_init(_RuneLocale *rl) __wcrtomb = _GB2312_wcrtomb; __mbsinit = _GB2312_mbsinit; __mb_cur_max = 2; + __mb_sb_limit = 128; return (0); } diff --git a/lib/libc/locale/gbk.c b/lib/libc/locale/gbk.c index 5288293c1660..802f78e7c177 100644 --- a/lib/libc/locale/gbk.c +++ b/lib/libc/locale/gbk.c @@ -42,6 +42,8 @@ __FBSDID("$FreeBSD$"); #include #include "mblocal.h" +extern int __mb_sb_limit; + static size_t _GBK_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static int _GBK_mbsinit(const mbstate_t *); @@ -61,6 +63,7 @@ _GBK_init(_RuneLocale *rl) __mbsinit = _GBK_mbsinit; _CurrentRuneLocale = rl; __mb_cur_max = 2; + __mb_sb_limit = 128; return (0); } diff --git a/lib/libc/locale/isctype.c b/lib/libc/locale/isctype.c index 13ac6c274974..be1b091237fc 100644 --- a/lib/libc/locale/isctype.c +++ b/lib/libc/locale/isctype.c @@ -48,7 +48,7 @@ int digittoint(c) int c; { - return (__maskrune(c, 0xFF)); + return (__sbmaskrune(c, 0xFF)); } #undef isalnum @@ -56,7 +56,7 @@ int isalnum(c) int c; { - return (__istype(c, _CTYPE_A|_CTYPE_D)); + return (__sbistype(c, _CTYPE_A|_CTYPE_D)); } #undef isalpha @@ -64,7 +64,7 @@ int isalpha(c) int c; { - return (__istype(c, _CTYPE_A)); + return (__sbistype(c, _CTYPE_A)); } #undef isascii @@ -80,7 +80,7 @@ int isblank(c) int c; { - return (__istype(c, _CTYPE_B)); + return (__sbistype(c, _CTYPE_B)); } #undef iscntrl @@ -88,7 +88,7 @@ int iscntrl(c) int c; { - return (__istype(c, _CTYPE_C)); + return (__sbistype(c, _CTYPE_C)); } #undef isdigit @@ -104,7 +104,7 @@ int 
isgraph(c) int c; { - return (__istype(c, _CTYPE_G)); + return (__sbistype(c, _CTYPE_G)); } #undef ishexnumber @@ -112,7 +112,7 @@ int ishexnumber(c) int c; { - return (__istype(c, _CTYPE_X)); + return (__sbistype(c, _CTYPE_X)); } #undef isideogram @@ -120,7 +120,7 @@ int isideogram(c) int c; { - return (__istype(c, _CTYPE_I)); + return (__sbistype(c, _CTYPE_I)); } #undef islower @@ -128,7 +128,7 @@ int islower(c) int c; { - return (__istype(c, _CTYPE_L)); + return (__sbistype(c, _CTYPE_L)); } #undef isnumber @@ -136,7 +136,7 @@ int isnumber(c) int c; { - return (__istype(c, _CTYPE_D)); + return (__sbistype(c, _CTYPE_D)); } #undef isphonogram @@ -144,7 +144,7 @@ int isphonogram(c) int c; { - return (__istype(c, _CTYPE_Q)); + return (__sbistype(c, _CTYPE_Q)); } #undef isprint @@ -152,7 +152,7 @@ int isprint(c) int c; { - return (__istype(c, _CTYPE_R)); + return (__sbistype(c, _CTYPE_R)); } #undef ispunct @@ -160,7 +160,7 @@ int ispunct(c) int c; { - return (__istype(c, _CTYPE_P)); + return (__sbistype(c, _CTYPE_P)); } #undef isrune @@ -168,7 +168,7 @@ int isrune(c) int c; { - return (__istype(c, 0xFFFFFF00L)); + return (__sbistype(c, 0xFFFFFF00L)); } #undef isspace @@ -176,7 +176,7 @@ int isspace(c) int c; { - return (__istype(c, _CTYPE_S)); + return (__sbistype(c, _CTYPE_S)); } #undef isspecial @@ -184,7 +184,7 @@ int isspecial(c) int c; { - return (__istype(c, _CTYPE_T)); + return (__sbistype(c, _CTYPE_T)); } #undef isupper @@ -192,7 +192,7 @@ int isupper(c) int c; { - return (__istype(c, _CTYPE_U)); + return (__sbistype(c, _CTYPE_U)); } #undef isxdigit @@ -216,7 +216,7 @@ int tolower(c) int c; { - return (__tolower(c)); + return (__sbtolower(c)); } #undef toupper @@ -224,6 +224,6 @@ int toupper(c) int c; { - return (__toupper(c)); + return (__sbtoupper(c)); } diff --git a/lib/libc/locale/iswctype.c b/lib/libc/locale/iswctype.c index eaa1bf3d2780..c2e0f539a3c1 100644 --- a/lib/libc/locale/iswctype.c +++ b/lib/libc/locale/iswctype.c @@ -61,7 +61,7 @@ int 
iswascii(wc) wint_t wc; { - return ((wc & ~0x7F) == 0); + return (wc < 0x80); } #undef iswblank diff --git a/lib/libc/locale/mskanji.c b/lib/libc/locale/mskanji.c index aba87e75ec86..9ee91de28c15 100644 --- a/lib/libc/locale/mskanji.c +++ b/lib/libc/locale/mskanji.c @@ -47,6 +47,8 @@ __FBSDID("$FreeBSD$"); #include #include "mblocal.h" +extern int __mb_sb_limit; + static size_t _MSKanji_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static int _MSKanji_mbsinit(const mbstate_t *); @@ -66,6 +68,7 @@ _MSKanji_init(_RuneLocale *rl) __mbsinit = _MSKanji_mbsinit; _CurrentRuneLocale = rl; __mb_cur_max = 2; + __mb_sb_limit = 256; return (0); } diff --git a/lib/libc/locale/none.c b/lib/libc/locale/none.c index 79981e5b93c7..22fcd203937b 100644 --- a/lib/libc/locale/none.c +++ b/lib/libc/locale/none.c @@ -58,6 +58,11 @@ static size_t _none_wcrtomb(char * __restrict, wchar_t, static size_t _none_wcsnrtombs(char * __restrict, const wchar_t ** __restrict, size_t, size_t, mbstate_t * __restrict); +/* setup defaults */ + +int __mb_cur_max = 1; +int __mb_sb_limit = 256; /* Expected to be <= _CACHED_RUNES */ + int _none_init(_RuneLocale *rl) { @@ -69,6 +74,7 @@ _none_init(_RuneLocale *rl) __wcsnrtombs = _none_wcsnrtombs; _CurrentRuneLocale = rl; __mb_cur_max = 1; + __mb_sb_limit = 256; return(0); } @@ -176,7 +182,6 @@ _none_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src, /* setup defaults */ -int __mb_cur_max = 1; size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict) = _none_mbrtowc; int (*__mbsinit)(const mbstate_t *) = _none_mbsinit; diff --git a/lib/libc/locale/setrunelocale.c b/lib/libc/locale/setrunelocale.c index 948470934d55..e723ea627aff 100644 --- a/lib/libc/locale/setrunelocale.c +++ b/lib/libc/locale/setrunelocale.c @@ -45,6 +45,8 @@ __FBSDID("$FreeBSD$"); #include "mblocal.h" #include "setlocale.h" +extern int __mb_sb_limit; + extern _RuneLocale *_Read_RuneMagi(FILE 
*); static int __setrunelocale(const char *); @@ -59,6 +61,7 @@ __setrunelocale(const char *encoding) static char ctype_encoding[ENCODING_LEN + 1]; static _RuneLocale *CachedRuneLocale; static int Cached__mb_cur_max; + static int Cached__mb_sb_limit; static size_t (*Cached__mbrtowc)(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static size_t (*Cached__wcrtomb)(char * __restrict, wchar_t, @@ -85,6 +88,7 @@ __setrunelocale(const char *encoding) strcmp(encoding, ctype_encoding) == 0) { _CurrentRuneLocale = CachedRuneLocale; __mb_cur_max = Cached__mb_cur_max; + __mb_sb_limit = Cached__mb_sb_limit; __mbrtowc = Cached__mbrtowc; __mbsinit = Cached__mbsinit; __mbsnrtowcs = Cached__mbsnrtowcs; @@ -147,6 +151,7 @@ __setrunelocale(const char *encoding) } CachedRuneLocale = _CurrentRuneLocale; Cached__mb_cur_max = __mb_cur_max; + Cached__mb_sb_limit = __mb_sb_limit; Cached__mbrtowc = __mbrtowc; Cached__mbsinit = __mbsinit; Cached__mbsnrtowcs = __mbsnrtowcs; diff --git a/lib/libc/locale/utf8.c b/lib/libc/locale/utf8.c index e467fc053390..086a1e49d041 100644 --- a/lib/libc/locale/utf8.c +++ b/lib/libc/locale/utf8.c @@ -35,6 +35,8 @@ __FBSDID("$FreeBSD$"); #include #include "mblocal.h" +extern int __mb_sb_limit; + static size_t _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, mbstate_t * __restrict); static int _UTF8_mbsinit(const mbstate_t *); @@ -63,6 +65,7 @@ _UTF8_init(_RuneLocale *rl) __wcsnrtombs = _UTF8_wcsnrtombs; _CurrentRuneLocale = rl; __mb_cur_max = 6; + __mb_sb_limit = 128; return (0); }