From d79cdd21de3c3091e563a069f688630499062dab Mon Sep 17 00:00:00 2001 From: Baptiste Daroussin Date: Sun, 1 Nov 2015 12:00:55 +0000 Subject: [PATCH] libc: Fix (and improve) nl_langinfo (CODESET) The output of "locale charmap" is identical to the result of nl_langinfo (CODESET) for any given locale. The logic for returning the codeset was very simplistic. It just returned the portion of the locale name after the period (e.g. en_FR.ISO8859-1 returned "ISO8859-1"). When softlinks were added to locales, this broke. e.g.: en_US returned "" en_FR.UTF8 returned "UTF8" en_FR.UTF-8 returned "UTF-8" zh_Hant_HK.Big5HKSCS returned "Big5HKSCS" zh_Hant_TW.Big5 returned "Big5" es_ES@euro returned "" In order to fix this properly, the named locale cannot be used to determine the encoding. This information was almost available in the rune data. Unfortunately, all the single byte encodings were listed as "NONE" encoding. So I adjusted the localedef tool to provide more information about the encoding. For example, instead of "NONE", the LC_CTYPE used by fr_FR.ISO8859-15 is now encoded as "NONE:ISO8859-15". The locale handlers now check whether the first four characters of the encoding are "NONE" and, if so, treat it as a single-byte encoding. The nl_langinfo handling of CODESET was adjusted accordingly. Now the following is returned: en_US returns "ISO8859-1" fr_FR.UTF8 returns "UTF-8" fr_FR.UTF-8 returns "UTF-8" zh_Hant_HK.Big5HKSCS returns "Big5" zh_Hant_TW.Big5 returns "Big5" es_ES@euro returns "ISO8859-15" As before, the "C" and "POSIX" locales return "US-ASCII". This is a big improvement. The result of nl_langinfo can never be a zero-length string and it will always be exclusively one of the values of the character maps of /usr/src/tools/tools/locale/etc/final-maps. 
Submitted by: marino Obtained from: DragonflyBSD --- lib/libc/locale/nl_langinfo.c | 30 ++++++++++++++++++++++-------- lib/libc/locale/setrunelocale.c | 2 +- usr.bin/localedef/wide.c | 6 +++--- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/lib/libc/locale/nl_langinfo.c b/lib/libc/locale/nl_langinfo.c index 3e8fe7cc6d3d..e3b370ae2b5f 100644 --- a/lib/libc/locale/nl_langinfo.c +++ b/lib/libc/locale/nl_langinfo.c @@ -37,7 +37,10 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include +#include +#include "mblocal.h" #include "lnumeric.h" #include "lmessages.h" #include "lmonetary.h" @@ -54,14 +57,25 @@ nl_langinfo_l(nl_item item, locale_t loc) switch (item) { case CODESET: - ret = ""; - if ((s = querylocale(LC_CTYPE_MASK, loc)) != NULL) { - if ((cs = strchr(s, '.')) != NULL) - ret = cs + 1; - else if (strcmp(s, "C") == 0 || - strcmp(s, "POSIX") == 0) - ret = "US-ASCII"; - } + s = XLOCALE_CTYPE(loc)->runes->__encoding; + if (strcmp(s, "EUC-CN") == 0) + ret = "eucCN"; + else if (strcmp(s, "EUC-JP") == 0) + ret = "eucJP"; + else if (strcmp(s, "EUC-KR") == 0) + ret = "eucKR"; + else if (strcmp(s, "EUC-TW") == 0) + ret = "eucTW"; + else if (strcmp(s, "BIG5") == 0) + ret = "Big5"; + else if (strcmp(s, "MSKanji") == 0) + ret = "SJIS"; + else if (strcmp(s, "NONE") == 0) + ret = "US-ASCII"; + else if (strncmp(s, "NONE:", 5) == 0) + ret = (char *)(s + 5); + else + ret = (char *)s; break; case D_T_FMT: ret = (char *) __get_current_time_locale(loc)->c_fmt; diff --git a/lib/libc/locale/setrunelocale.c b/lib/libc/locale/setrunelocale.c index 67c632e13425..00e4d985a6b4 100644 --- a/lib/libc/locale/setrunelocale.c +++ b/lib/libc/locale/setrunelocale.c @@ -129,7 +129,7 @@ __setrunelocale(struct xlocale_ctype *l, const char *encoding) rl->__sputrune = NULL; rl->__sgetrune = NULL; - if (strcmp(rl->__encoding, "NONE") == 0) + if (strncmp(rl->__encoding, "NONE", 4) == 0) ret = _none_init(l, rl); else if (strcmp(rl->__encoding, "UTF-8") == 0) ret = _UTF8_init(l, 
rl); diff --git a/usr.bin/localedef/wide.c b/usr.bin/localedef/wide.c index 1c57fed2de7c..9b5940d79cdc 100644 --- a/usr.bin/localedef/wide.c +++ b/usr.bin/localedef/wide.c @@ -37,7 +37,6 @@ #include __FBSDID("$FreeBSD$"); -#include #include #include #include @@ -62,7 +61,8 @@ static int tomb_mbs(char *, wchar_t); static int (*_towide)(wchar_t *, const char *, unsigned) = towide_none; static int (*_tomb)(char *, wchar_t) = tomb_none; -static const char *_encoding = "NONE"; +static char _encoding_buffer[20] = {'N','O','N','E'}; +static const char *_encoding = _encoding_buffer; static int _nbits = 7; /* @@ -642,9 +642,9 @@ set_wide_encoding(const char *encoding) _towide = towide_none; _tomb = tomb_none; - _encoding = "NONE"; _nbits = 8; + snprint(_encoding_buffer, sizeof(_encoding_buffer), "NONE:%s", encoding); for (i = 0; mb_encodings[i].name; i++) { if (strcasecmp(encoding, mb_encodings[i].name) == 0) { _towide = mb_encodings[i].towide;