Add POSIX-style support for multibyte characters to od(1): the 'c'
conversion interprets input bytes as multibyte sequences and displays
printable characters in the area corresponding to their first byte.
The remaining bytes are shown as "**".
This commit is contained in:
Tim J. Robbins 2004-07-11 01:11:12 +00:00
parent 7602de354f
commit 40ccfb3137
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=131954
4 changed files with 114 additions and 11 deletions

View File

@ -39,15 +39,30 @@ __FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <assert.h>
#include <stdio.h>
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#include <wchar.h>
#include <wctype.h>
#include "hexdump.h"
void
conv_c(PR *pr, u_char *p)
conv_c(PR *pr, u_char *p, size_t bufsize)
{
char buf[10];
char const *str;
wchar_t wc;
size_t clen, oclen;
int converr, pad, width;
char peekbuf[MB_LEN_MAX];
if (pr->mbleft > 0) {
str = "**";
pr->mbleft--;
goto strpr;
}
switch(*p) {
case '\0':
@ -78,9 +93,53 @@ conv_c(PR *pr, u_char *p)
default:
break;
}
if (isprint(*p)) {
*pr->cchar = 'c';
(void)printf(pr->fmt, *p);
/*
* Multibyte characters are disabled for hexdump(1) for backwards
* compatibility and consistency (none of its other output formats
* recognize them correctly).
*/
converr = 0;
if (odmode && MB_CUR_MAX > 1) {
oclen = 0;
retry:
clen = mbrtowc(&wc, p, bufsize, &pr->mbstate);
if (clen == 0)
clen = 1;
else if (clen == (size_t)-1 || (clen == (size_t)-2 &&
buf == peekbuf)) {
memset(&pr->mbstate, 0, sizeof(pr->mbstate));
wc = *p;
clen = 1;
converr = 1;
} else if (clen == (size_t)-2) {
/*
* Incomplete character; peek ahead and see if we
* can complete it.
*/
oclen = bufsize;
bufsize = peek(p = peekbuf, MB_CUR_MAX);
goto retry;
}
clen += oclen;
} else {
wc = *p;
clen = 1;
}
if (!converr && iswprint(wc)) {
if (!odmode) {
*pr->cchar = 'c';
(void)printf(pr->fmt, (int)wc);
} else {
*pr->cchar = 'C';
assert(strcmp(pr->fmt, "%3C") == 0);
width = wcwidth(wc);
assert(width > 0);
pad = 3 - width;
if (pad < 0)
pad = 0;
(void)printf("%*s%C", pad, "", wc);
pr->mbleft = clen - 1;
}
} else {
(void)sprintf(buf, "%03o", (int)*p);
str = buf;

View File

@ -132,7 +132,8 @@ print(PR *pr, u_char *bp)
(void)printf(pr->fmt, "");
break;
case F_C:
conv_c(pr, bp);
conv_c(pr, bp, eaddress ? eaddress - address :
blocksize - address % blocksize);
break;
case F_CHAR:
(void)printf(pr->fmt, *bp);
@ -261,6 +262,10 @@ get(void)
errx(1, "cannot skip past end of input");
if (need == blocksize)
return((u_char *)NULL);
/*
* XXX bcmp() is not quite right in the presence
* of multibyte characters.
*/
if (vflag != ALL &&
valid_save &&
bcmp(curp, savp, nread) == 0) {
@ -284,6 +289,10 @@ get(void)
if (length != -1)
length -= n;
if (!(need -= n)) {
/*
* XXX bcmp() is not quite right in the presence
* of multibyte characters.
*/
if (vflag == ALL || vflag == FIRST ||
valid_save == 0 ||
bcmp(curp, savp, blocksize) != 0) {
@ -303,6 +312,27 @@ get(void)
}
}
/*
 * Look ahead at standard input without consuming it.
 *
 * Reads at most nbytes bytes from stdin into buf, then hands every byte
 * that was read back to the stream with ungetc() so that later reads see
 * the same data again.  When the global length is not -1, the request is
 * first clamped to length.
 *
 * Returns the number of bytes stored in buf, which is smaller than nbytes
 * if EOF was reached or the clamp applied.
 *
 * NOTE(review): this pushes back more than one byte; ISO C only guarantees
 * a single byte of pushback, so this relies on the host stdio accepting
 * more -- confirm for any new platform.
 */
size_t
peek(u_char *buf, size_t nbytes)
{
	size_t got, i;
	int ch;

	/* Honour the byte limit held in length, where -1 disables the clamp. */
	if (length != -1 && nbytes > length)
		nbytes = length;

	/* Collect bytes until the request is satisfied or input runs out. */
	for (got = 0; got < nbytes; got++) {
		ch = getchar();
		if (ch == EOF)
			break;
		buf[got] = ch;
	}

	/*
	 * Return the bytes last-read-first so the stream replays them in
	 * their original order.
	 */
	for (i = got; i > 0; i--)
		ungetc(buf[i - 1], stdin);

	return (got);
}
int
next(char **argv)
{

View File

@ -34,6 +34,8 @@
* $FreeBSD$
*/
#include <wchar.h>
typedef struct _pr {
struct _pr *nextpr; /* next print unit */
#define F_ADDRESS 0x001 /* print offset */
@ -52,6 +54,8 @@ typedef struct _pr {
char *cchar; /* conversion character */
char *fmt; /* printf format */
char *nospace; /* no whitespace version */
int mbleft; /* bytes left of multibyte char. */
mbstate_t mbstate; /* conversion state */
} PR;
typedef struct _fu {
@ -88,7 +92,7 @@ void badconv(char *);
void badfmt(const char *);
void badsfmt(void);
void bpad(PR *);
void conv_c(PR *, u_char *);
void conv_c(PR *, u_char *, size_t);
void conv_u(PR *, u_char *);
void display(void);
void doskip(const char *, int);
@ -98,6 +102,7 @@ void newsyntax(int, char ***);
int next(char **);
void nomem(void);
void oldsyntax(int, char ***);
size_t peek(u_char *, size_t);
void rewrite(FS *);
int size(FS *);
void usage(void);

View File

@ -32,7 +32,7 @@
.\" @(#)od.1 8.1 (Berkeley) 6/6/93
.\" $FreeBSD$
.\"
.Dd July 3, 2004
.Dd July 11, 2004
.Os
.Dt OD 1
.Sh NAME
@ -179,6 +179,10 @@ characters, which are represented as C escapes:
.It vertical tab
\ev
.El
.Pp
Multi-byte characters are displayed in the area corresponding to the first
byte of the character.
The remaining bytes are shown as
.Ql ** .
.It Xo
.Sm off
.Op Cm d | o | u | x
@ -231,6 +235,15 @@ contain one line for each format.
If no output format is specified,
.Fl t Ar oS
is assumed.
.Sh ENVIRONMENT
The
.Ev LANG , LC_ALL
and
.Ev LC_CTYPE
environment variables affect the execution of
.Nm
as described in
.Xr environ 7 .
.Sh DIAGNOSTICS
.Ex -std
.Sh COMPATIBILITY
@ -252,7 +265,3 @@ An
.Nm
command appeared in
.At v1 .
.Sh BUGS
The
.Nm
utility does not recognize multibyte characters.