diff --git a/gnu/usr.bin/grep/search.c b/gnu/usr.bin/grep/search.c index a1951567e0d9..982d2c5ce098 100644 --- a/gnu/usr.bin/grep/search.c +++ b/gnu/usr.bin/grep/search.c @@ -524,11 +524,16 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) if (mb_cur_max > 1) { const char *s; - int mr; + size_t mr; wchar_t pwc; + /* Locate the start of the multibyte character + before the match position (== beg + start). */ if (using_utf8) { + /* UTF-8 is a special case: scan backwards + until we find a 7-bit character or a + lead byte. */ s = beg + start - 1; while (s > buf && (unsigned char) *s >= 0x80 @@ -536,15 +541,40 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) --s; } else - s = last_char; - mr = mbtowc (&pwc, s, beg + start - s); - if (mr <= 0) + { + /* Scan forwards to find the start of the + last complete character before the + match position. */ + size_t bytes_left = start - 1; + s = beg; + while (bytes_left > 0) + { + mr = mbrlen (s, bytes_left, &mbs); + if (mr == (size_t) -1 || mr == 0) + { + memset (&mbs, '\0', sizeof (mbs)); + s++; + bytes_left--; + continue; + } + if (mr == (size_t) -2) + { + memset (&mbs, '\0', sizeof (mbs)); + break; + } + s += mr; + bytes_left -= mr; + } + } + mr = mbrtowc (&pwc, s, beg + start - s, &mbs); + if (mr == (size_t) -2 || mr == (size_t) -1 || + mr == 0) { memset (&mbs, '\0', sizeof (mbstate_t)); lword_match = 1; } else if (!(iswalnum (pwc) || pwc == L'_') - && mr == (int) (beg + start - s)) + && mr == beg + start - s) lword_match = 1; } else