Add support for multibyte characters.

This commit is contained in:
Tim J. Robbins 2004-08-02 11:10:20 +00:00
parent 09e7619917
commit 28d92b747e
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=133008
2 changed files with 94 additions and 57 deletions

View File

@ -34,7 +34,7 @@
.\" .\"
.\" Modified by Gareth McCaughan to describe the new version of `fmt' .\" Modified by Gareth McCaughan to describe the new version of `fmt'
.\" rather than the old one. .\" rather than the old one.
.Dd July 17, 2004 .Dd August 2, 2004
.Dt FMT 1 .Dt FMT 1
.Os .Os
.Sh NAME .Sh NAME
@ -154,6 +154,15 @@ the command
.Pp .Pp
will reformat a paragraph, will reformat a paragraph,
evening the lines. evening the lines.
.Sh ENVIRONMENT
The
.Ev LANG , LC_ALL
and
.Ev LC_CTYPE
environment variables affect the execution of
.Nm
as described in
.Xr environ 7 .
.Sh SEE ALSO .Sh SEE ALSO
.Xr mail 1 , .Xr mail 1 ,
.Xr nroff 1 .Xr nroff 1
@ -184,5 +193,3 @@ The
.Nm .Nm
utility is not infallible in guessing what lines are mail headers and what utility is not infallible in guessing what lines are mail headers and what
lines are not. lines are not.
.Pp
Multibyte characters are not handled correctly.

View File

@ -175,7 +175,6 @@ static const char copyright[] =
#include <sys/cdefs.h> #include <sys/cdefs.h>
__FBSDID("$FreeBSD$"); __FBSDID("$FreeBSD$");
#include <ctype.h>
#include <err.h> #include <err.h>
#include <locale.h> #include <locale.h>
#include <stdio.h> #include <stdio.h>
@ -183,6 +182,8 @@ __FBSDID("$FreeBSD$");
#include <string.h> #include <string.h>
#include <sysexits.h> #include <sysexits.h>
#include <unistd.h> #include <unistd.h>
#include <wchar.h>
#include <wctype.h>
/* Something that, we hope, will never be a genuine line length, /* Something that, we hope, will never be a genuine line length,
* indentation etc. * indentation etc.
@ -222,14 +223,15 @@ static int coalesce_spaces_P=0; /* Coalesce multiple whitespace -> ' ' ? */
static int allow_indented_paragraphs=0; /* Can first line have diff. ind.? */ static int allow_indented_paragraphs=0; /* Can first line have diff. ind.? */
static int tab_width=8; /* Number of spaces per tab stop */ static int tab_width=8; /* Number of spaces per tab stop */
static size_t output_tab_width=8; /* Ditto, when squashing leading spaces */ static size_t output_tab_width=8; /* Ditto, when squashing leading spaces */
static const char *sentence_enders=".?!"; /* Double-space after these */ static const wchar_t *sentence_enders=L".?!"; /* Double-space after these */
static int grok_mail_headers=0; /* treat embedded mail headers magically? */ static int grok_mail_headers=0; /* treat embedded mail headers magically? */
static int format_troff=0; /* Format troff? */ static int format_troff=0; /* Format troff? */
static int n_errors=0; /* Number of failed files. Return on exit. */ static int n_errors=0; /* Number of failed files. Return on exit. */
static char *output_buffer=0; /* Output line will be built here */ static wchar_t *output_buffer=0; /* Output line will be built here */
static size_t x; /* Horizontal position in output line */ static size_t x; /* Horizontal position in output line */
static size_t x0; /* Ditto, ignoring leading whitespace */ static size_t x0; /* Ditto, ignoring leading whitespace */
static size_t output_buffer_length = 0;
static size_t pending_spaces; /* Spaces to add before next word */ static size_t pending_spaces; /* Spaces to add before next word */
static int output_in_paragraph=0; /* Any of current para written out yet? */ static int output_in_paragraph=0; /* Any of current para written out yet? */
@ -237,13 +239,14 @@ static int output_in_paragraph=0; /* Any of current para written out yet? */
static void process_named_file (const char *); static void process_named_file (const char *);
static void process_stream (FILE *, const char *); static void process_stream (FILE *, const char *);
static size_t indent_length (const char *, size_t); static size_t indent_length (const wchar_t *, size_t);
static int might_be_header (const unsigned char *); static int might_be_header (const wchar_t *);
static void new_paragraph (size_t, size_t); static void new_paragraph (size_t, size_t);
static void output_word (size_t, size_t, const char *, size_t, size_t); static void output_word (size_t, size_t, const wchar_t *, size_t,
size_t);
static void output_indent (size_t); static void output_indent (size_t);
static void center_stream (FILE *, const char *); static void center_stream (FILE *, const char *);
static char * get_line (FILE *, size_t *); static wchar_t * get_line (FILE *, size_t *);
static void * xrealloc (void *, size_t); static void * xrealloc (void *, size_t);
#define XMALLOC(x) xrealloc(0,x) #define XMALLOC(x) xrealloc(0,x)
@ -254,7 +257,9 @@ static void * xrealloc (void *, size_t);
int int
main(int argc, char *argv[]) { main(int argc, char *argv[]) {
int ch; /* used for |getopt| processing */ int ch; /* used for |getopt| processing */
wchar_t *tmp;
size_t len;
const char *src;
(void) setlocale(LC_CTYPE, ""); (void) setlocale(LC_CTYPE, "");
@ -267,7 +272,13 @@ main(int argc, char *argv[]) {
format_troff = 1; format_troff = 1;
continue; continue;
case 'd': case 'd':
sentence_enders = optarg; src = optarg;
len = mbsrtowcs(NULL, &src, 0, NULL);
if (len == (size_t)-1)
err(EX_USAGE, "bad sentence-ending character set");
tmp = XMALLOC((len + 1) * sizeof(wchar_t));
mbsrtowcs(tmp, &src, len + 1, NULL);
sentence_enders = tmp;
continue; continue;
case 'l': case 'l':
output_tab_width output_tab_width
@ -340,7 +351,8 @@ main(int argc, char *argv[]) {
} }
if (goal_length==0) goal_length = 65; if (goal_length==0) goal_length = 65;
if (max_length==0) max_length = goal_length+10; if (max_length==0) max_length = goal_length+10;
output_buffer = XMALLOC(max_length+1); /* really needn't be longer */ /* really needn't be longer */
output_buffer = XMALLOC((max_length+1) * sizeof(wchar_t));
/* 2. Process files. */ /* 2. Process files. */
@ -365,6 +377,7 @@ process_named_file(const char *name) {
if (!f) { warn("%s", name); ++n_errors; } if (!f) { warn("%s", name); ++n_errors; }
else { else {
process_stream(f, name); process_stream(f, name);
if (ferror(f)) { warn("%s", name); ++n_errors; }
fclose(f); fclose(f);
} }
} }
@ -388,7 +401,7 @@ process_stream(FILE *stream, const char *name) {
size_t first_indent=SILLY; /* indentation of line 0 of paragraph */ size_t first_indent=SILLY; /* indentation of line 0 of paragraph */
HdrType prev_header_type=hdr_ParagraphStart; HdrType prev_header_type=hdr_ParagraphStart;
/* ^-- header_type of previous line; -1 at para start */ /* ^-- header_type of previous line; -1 at para start */
char *line; wchar_t *line;
size_t length; size_t length;
if (centerP) { center_stream(stream, name); return; } if (centerP) { center_stream(stream, name); return; }
@ -424,9 +437,9 @@ process_stream(FILE *stream, const char *name) {
if (header_type==hdr_Header) last_indent=2; /* for cont. lines */ if (header_type==hdr_Header) last_indent=2; /* for cont. lines */
if (length==0 || (line[0]=='.' && !format_troff)) { if (length==0 || (line[0]=='.' && !format_troff)) {
if (length==0) if (length==0)
putchar('\n'); putwchar('\n');
else else
printf("%.*s\n", (int)length, line); wprintf(L"%.*ls\n", (int)length, line);
prev_header_type=hdr_ParagraphStart; prev_header_type=hdr_ParagraphStart;
continue; continue;
} }
@ -465,7 +478,7 @@ process_stream(FILE *stream, const char *name) {
/* How long is the indent on this line? /* How long is the indent on this line?
*/ */
static size_t static size_t
indent_length(const char *line, size_t length) { indent_length(const wchar_t *line, size_t length) {
size_t n=0; size_t n=0;
while (n<length && *line++ == ' ') ++n; while (n<length && *line++ == ' ') ++n;
return n; return n;
@ -478,22 +491,21 @@ indent_length(const char *line, size_t length) {
* conservative to avoid mangling ordinary civilised text. * conservative to avoid mangling ordinary civilised text.
*/ */
static int static int
might_be_header(const unsigned char *line) { might_be_header(const wchar_t *line) {
if (!isupper(*line++)) return 0; if (!iswupper(*line++)) return 0;
while (*line && (isalnum(*line) || *line=='-')) ++line; while (*line && (iswalnum(*line) || *line=='-')) ++line;
return (*line==':' && isspace(line[1])); return (*line==':' && iswspace(line[1]));
} }
/* Begin a new paragraph with an indent of |indent| spaces. /* Begin a new paragraph with an indent of |indent| spaces.
*/ */
static void static void
new_paragraph(size_t old_indent, size_t indent) { new_paragraph(size_t old_indent, size_t indent) {
if (x0) { if (output_buffer_length) {
if (old_indent>0) output_indent(old_indent); if (old_indent>0) output_indent(old_indent);
fwrite(output_buffer, 1, x0, stdout); wprintf(L"%.*ls\n", (int)output_buffer_length, output_buffer);
putchar('\n');
} }
x=indent; x0=0; pending_spaces=0; x=indent; x0=0; output_buffer_length=0; pending_spaces=0;
output_in_paragraph = 0; output_in_paragraph = 0;
} }
@ -503,11 +515,11 @@ static void
output_indent(size_t n_spaces) { output_indent(size_t n_spaces) {
if (output_tab_width) { if (output_tab_width) {
while (n_spaces >= output_tab_width) { while (n_spaces >= output_tab_width) {
putchar('\t'); putwchar('\t');
n_spaces -= output_tab_width; n_spaces -= output_tab_width;
} }
} }
while (n_spaces-- > 0) putchar(' '); while (n_spaces-- > 0) putwchar(' ');
} }
/* Output a single word, or add it to the buffer. /* Output a single word, or add it to the buffer.
@ -515,9 +527,17 @@ output_indent(size_t n_spaces) {
* lines of a paragraph. They'll often be the same, of course. * lines of a paragraph. They'll often be the same, of course.
*/ */
static void static void
output_word(size_t indent0, size_t indent1, const char *word, size_t length, size_t spaces) { output_word(size_t indent0, size_t indent1, const wchar_t *word, size_t length, size_t spaces) {
size_t new_x = x+pending_spaces+length; size_t new_x;
size_t indent = output_in_paragraph ? indent1 : indent0; size_t indent = output_in_paragraph ? indent1 : indent0;
size_t width;
const wchar_t *p;
int cwidth;
for (p = word, width = 0; p < &word[length]; p++)
width += (cwidth = wcwidth(*p)) > 0 ? cwidth : 1;
new_x = x + pending_spaces + width;
/* If either |spaces==0| (at end of line) or |coalesce_spaces_P| /* If either |spaces==0| (at end of line) or |coalesce_spaces_P|
* (squashing internal whitespace), then add just one space; * (squashing internal whitespace), then add just one space;
@ -525,16 +545,17 @@ output_word(size_t indent0, size_t indent1, const char *word, size_t length, siz
* actually add two spaces. * actually add two spaces.
*/ */
if (coalesce_spaces_P || spaces==0) if (coalesce_spaces_P || spaces==0)
spaces = strchr(sentence_enders, word[length-1]) ? 2 : 1; spaces = wcschr(sentence_enders, word[length-1]) ? 2 : 1;
if (new_x<=goal_length) { if (new_x<=goal_length) {
/* After adding the word we still aren't at the goal length, /* After adding the word we still aren't at the goal length,
* so clearly we add it to the buffer rather than outputing it. * so clearly we add it to the buffer rather than outputing it.
*/ */
memset(output_buffer+x0, ' ', pending_spaces); wmemset(output_buffer+output_buffer_length, L' ', pending_spaces);
x0 += pending_spaces; x += pending_spaces; x0 += pending_spaces; x += pending_spaces;
memcpy(output_buffer+x0, word, length); output_buffer_length += pending_spaces;
x0 += length; x += length; wmemcpy(output_buffer+output_buffer_length, word, length);
x0 += width; x += width; output_buffer_length += length;
pending_spaces = spaces; pending_spaces = spaces;
} }
else { else {
@ -545,28 +566,30 @@ output_word(size_t indent0, size_t indent1, const char *word, size_t length, siz
* In case (3) we put a newline in between. * In case (3) we put a newline in between.
*/ */
if (indent>0) output_indent(indent); if (indent>0) output_indent(indent);
fwrite(output_buffer, 1, x0, stdout); wprintf(L"%.*ls", (int)output_buffer_length, output_buffer);
if (x0==0 || (new_x <= max_length && new_x-goal_length <= goal_length-x)) { if (x0==0 || (new_x <= max_length && new_x-goal_length <= goal_length-x)) {
printf("%*s", (int)pending_spaces, ""); wprintf(L"%*ls", (int)pending_spaces, L"");
goto write_out_word; goto write_out_word;
} }
else { else {
/* If the word takes us over the limit on its own, just /* If the word takes us over the limit on its own, just
* spit it out and don't bother buffering it. * spit it out and don't bother buffering it.
*/ */
if (indent+length > max_length) { if (indent+width > max_length) {
putchar('\n'); putwchar('\n');
if (indent>0) output_indent(indent); if (indent>0) output_indent(indent);
write_out_word: write_out_word:
fwrite(word, 1, length, stdout); wprintf(L"%.*ls", (int)length, word);
x0 = 0; x = indent1; pending_spaces = 0; x0 = 0; x = indent1; pending_spaces = 0;
output_buffer_length = 0;
} }
else { else {
memcpy(output_buffer, word, length); wmemcpy(output_buffer, word, length);
x0 = length; x = length+indent1; pending_spaces = spaces; x0 = width; x = width+indent1; pending_spaces = spaces;
output_buffer_length = length;
} }
} }
putchar('\n'); putwchar('\n');
output_in_paragraph = 1; output_in_paragraph = 1;
} }
} }
@ -576,15 +599,19 @@ output_word(size_t indent0, size_t indent1, const char *word, size_t length, siz
*/ */
static void static void
center_stream(FILE *stream, const char *name) { center_stream(FILE *stream, const char *name) {
char *line; wchar_t *line, *p;
size_t length; size_t length;
size_t width;
int cwidth;
while ((line=get_line(stream, &length)) != 0) { while ((line=get_line(stream, &length)) != 0) {
size_t l=length; size_t l=length;
while (l>0 && isspace((unsigned char)*line)) { ++line; --l; } while (l>0 && iswspace(*line)) { ++line; --l; }
length=l; length=l;
while (l<goal_length) { putchar(' '); l+=2; } for (p = line, width = 0; p < &line[length]; p++)
fwrite(line, 1, length, stdout); width += (cwidth = wcwidth(*p)) > 0 ? cwidth : 1;
putchar('\n'); l = width;
while (l<goal_length) { putwchar(' '); l+=2; }
wprintf(L"%.*ls\n", (int)length, line);
} }
if (ferror(stream)) { warn("%s", name); ++n_errors; } if (ferror(stream)) { warn("%s", name); ++n_errors; }
} }
@ -600,32 +627,35 @@ center_stream(FILE *stream, const char *name) {
* Don't confuse |spaces_pending| here with the global * Don't confuse |spaces_pending| here with the global
* |pending_spaces|. * |pending_spaces|.
*/ */
static char * static wchar_t *
get_line(FILE *stream, size_t *lengthp) { get_line(FILE *stream, size_t *lengthp) {
static char *buf=NULL; static wchar_t *buf=NULL;
static size_t length=0; static size_t length=0;
size_t len=0; size_t len=0;
int ch; wint_t ch;
size_t spaces_pending=0; size_t spaces_pending=0;
int troff=0; int troff=0;
size_t col=0;
int cwidth;
if (buf==NULL) { length=100; buf=XMALLOC(length); } if (buf==NULL) { length=100; buf=XMALLOC(length * sizeof(wchar_t)); }
while ((ch=getc(stream)) != '\n' && ch != EOF) { while ((ch=getwc(stream)) != '\n' && ch != WEOF) {
if (len+spaces_pending==0 && ch=='.' && !format_troff) troff=1; if (len+spaces_pending==0 && ch=='.' && !format_troff) troff=1;
if (ch==' ') ++spaces_pending; if (ch==' ') ++spaces_pending;
else if (troff || isprint(ch)) { else if (troff || iswprint(ch)) {
while (len+spaces_pending >= length) { while (len+spaces_pending >= length) {
length*=2; buf=xrealloc(buf, length); length*=2; buf=xrealloc(buf, length * sizeof(wchar_t));
} }
while (spaces_pending > 0) { --spaces_pending; buf[len++]=' '; } while (spaces_pending > 0) { --spaces_pending; buf[len++]=' '; col++; }
buf[len++] = ch; buf[len++] = ch;
col += (cwidth = wcwidth(ch)) > 0 ? cwidth : 1;
} }
else if (ch=='\t') else if (ch=='\t')
spaces_pending += tab_width - (len+spaces_pending)%tab_width; spaces_pending += tab_width - (col+spaces_pending)%tab_width;
else if (ch=='\b') { if (len) --len; } else if (ch=='\b') { if (len) --len; if (col) --col; }
} }
*lengthp=len; *lengthp=len;
return (len>0 || ch!=EOF) ? buf : 0; return (len>0 || ch!=WEOF) ? buf : 0;
} }
/* (Re)allocate some memory, exiting with an error if we can't. /* (Re)allocate some memory, exiting with an error if we can't.