2238 lines
65 KiB
C
2238 lines
65 KiB
C
/* Permuted index for GNU, with keywords in their context.
|
||
Copyright (C) 1990, 1991, 1993 Free Software Foundation, Inc.
|
||
Francois Pinard <pinard@iro.umontreal.ca>, 1988.
|
||
|
||
This program is free software; you can redistribute it and/or modify
|
||
it under the terms of the GNU General Public License as published by
|
||
the Free Software Foundation; either version 2, or (at your option)
|
||
any later version.
|
||
|
||
This program is distributed in the hope that it will be useful, but
|
||
WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||
General Public License for more details.
|
||
|
||
You should have received a copy of the GNU General Public License
|
||
along with this program; if not, write to the Free Software
|
||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||
*/
|
||
|
||
#ifdef HAVE_CONFIG_H
|
||
#include "config.h"
|
||
#endif
|
||
|
||
const char *version_string = "GNU ptx version 0.3";
|
||
|
||
char *const copyright = "\
|
||
This program is free software; you can redistribute it and/or modify\n\
|
||
it under the terms of the GNU General Public License as published by\n\
|
||
the Free Software Foundation; either version 2, or (at your option)\n\
|
||
any later version.\n\
|
||
\n\
|
||
This program is distributed in the hope that it will be useful,\n\
|
||
but WITHOUT ANY WARRANTY; without even the implied warranty of\n\
|
||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\n\
|
||
GNU General Public License for more details.\n\
|
||
\n\
|
||
You should have received a copy of the GNU General Public License\n\
|
||
along with this program; if not, write to the Free Software\n\
|
||
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.\n";
|
||
|
||
/* Reallocation step when swallowing non regular files. The value is not
|
||
the actual reallocation step, but its base two logarithm. */
|
||
#define SWALLOW_REALLOC_LOG 12
|
||
|
||
/* Imported from "regex.c". */
|
||
#define Sword 1
|
||
|
||
#ifdef STDC_HEADERS
|
||
|
||
#include <stdlib.h>
|
||
#include <ctype.h>
|
||
|
||
#else /* not STDC_HEADERS */
|
||
|
||
/* These definitions work, for all 256 characters. */
|
||
#define isspace(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
|
||
#define isxdigit(c) \
|
||
(((unsigned char) (c) >= 'a' && (unsigned char) (c) <= 'f') \
|
||
|| ((unsigned char) (c) >= 'A' && (unsigned char) (c) <= 'F') \
|
||
|| ((unsigned char) (c) >= '0' && (unsigned char) (c) <= '9'))
|
||
#define islower(c) ((unsigned char) (c) >= 'a' && (unsigned char) (c) <= 'z')
|
||
#define isupper(c) ((unsigned char) (c) >= 'A' && (unsigned char) (c) <= 'Z')
|
||
#define isalpha(c) (islower (c) || isupper (c))
|
||
#define toupper(c) (islower (c) ? (c) - 'a' + 'A' : (c))
|
||
|
||
#endif /* not STDC_HEADERS */
|
||
|
||
#if !defined (isascii) || defined (STDC_HEADERS)
|
||
#undef isascii
|
||
#define isascii(c) 1
|
||
#endif
|
||
|
||
#define ISXDIGIT(c) (isascii (c) && isxdigit (c))
|
||
#define ISODIGIT(c) ((c) >= '0' && (c) <= '7')
|
||
#define HEXTOBIN(c) ((c)>='a'&&(c)<='f' ? (c)-'a'+10 : (c)>='A'&&(c)<='F' ? (c)-'A'+10 : (c)-'0')
|
||
#define OCTTOBIN(c) ((c) - '0')
|
||
|
||
#include <stdio.h>
|
||
#include <fcntl.h>
|
||
#include <sys/types.h>
|
||
#include <sys/stat.h>
|
||
|
||
#if !defined(S_ISREG) && defined(S_IFREG)
|
||
#define S_ISREG(m) (((m) & S_IFMT) == S_IFREG)
|
||
#endif
|
||
|
||
#ifdef HAVE_STRING_H
|
||
#include <string.h>
|
||
#else /* not HAVE_STRING_H */
|
||
#include <strings.h>
|
||
#define strchr index
|
||
#define strrchr rindex
|
||
#endif /* not HAVE_STRING_H */
|
||
|
||
#include "getopt.h"
|
||
|
||
#include <errno.h>
|
||
#ifndef errno
|
||
extern int errno;
|
||
#endif
|
||
|
||
#include "bumpalloc.h"
|
||
#include "diacrit.h"
|
||
#include "regex.h"
|
||
|
||
#ifndef __STDC__
|
||
void *xmalloc ();
|
||
void *xrealloc ();
|
||
#else
|
||
void *xmalloc (int);
|
||
void *xrealloc (void *, int);
|
||
#endif
|
||
|
||
|
||
/* Global definitions. */
|
||
|
||
const char *program_name; /* name of this program */
|
||
static int show_help = 0; /* display usage information and exit */
|
||
static int show_version = 0; /* print the version and exit */
|
||
|
||
/* Program options. */
|
||
|
||
enum Format
|
||
{
|
||
DUMB_FORMAT, /* output for a dumb terminal */
|
||
ROFF_FORMAT, /* output for `troff' or `nroff' */
|
||
TEX_FORMAT, /* output for `TeX' or `LaTeX' */
|
||
UNKNOWN_FORMAT /* output format still unknown */
|
||
};
|
||
|
||
int gnu_extensions = 1; /* trigger all GNU extensions */
|
||
int auto_reference = 0; /* references are `file_name:line_number:' */
|
||
int input_reference = 0; /* references at beginning of input lines */
|
||
int right_reference = 0; /* output references after right context */
|
||
int line_width = 72; /* output line width in characters */
|
||
int gap_size = 3; /* number of spaces between output fields */
|
||
const char *truncation_string = "/";
|
||
/* string used to mark line truncations */
|
||
const char *macro_name = "xx"; /* macro name for roff or TeX output */
|
||
enum Format output_format = UNKNOWN_FORMAT;
|
||
/* output format */
|
||
|
||
int ignore_case = 0; /* fold lower to upper case for sorting */
|
||
const char *context_regex_string = NULL;
|
||
/* raw regex for end of context */
|
||
const char *word_regex_string = NULL;
|
||
/* raw regex for a keyword */
|
||
const char *break_file = NULL; /* name of the `Break characters' file */
|
||
const char *only_file = NULL; /* name of the `Only words' file */
|
||
const char *ignore_file = NULL; /* name of the `Ignore words' file */
|
||
|
||
/* A BLOCK delimit a region in memory of arbitrary size, like the copy of a
|
||
whole file. A WORD is something smaller, its length should fit in a
|
||
short integer. A WORD_TABLE may contain several WORDs. */
|
||
|
||
typedef struct
|
||
{
|
||
char *start; /* pointer to beginning of region */
|
||
char *end; /* pointer to end + 1 of region */
|
||
}
|
||
BLOCK;
|
||
|
||
typedef struct
|
||
{
|
||
char *start; /* pointer to beginning of region */
|
||
short size; /* length of the region */
|
||
}
|
||
WORD;
|
||
|
||
typedef struct
|
||
{
|
||
WORD *start; /* array of WORDs */
|
||
size_t length; /* number of entries */
|
||
}
|
||
WORD_TABLE;
|
||
|
||
/* Pattern description tables. */
|
||
|
||
/* For each character, provide its folded equivalent. */
|
||
unsigned char folded_chars[CHAR_SET_SIZE];
|
||
|
||
/* For each character, indicate if it is part of a word. */
|
||
char syntax_table[CHAR_SET_SIZE];
|
||
char *re_syntax_table = syntax_table;
|
||
|
||
/* Compiled regex for end of context. */
|
||
struct re_pattern_buffer *context_regex;
|
||
|
||
/* End of context pattern register indices. */
|
||
struct re_registers context_regs;
|
||
|
||
/* Compiled regex for a keyword. */
|
||
struct re_pattern_buffer *word_regex;
|
||
|
||
/* Keyword pattern register indices. */
|
||
struct re_registers word_regs;
|
||
|
||
/* A word characters fastmap is used only when no word regexp has been
|
||
provided. A word is then made up of a sequence of one or more characters
|
||
allowed by the fastmap. Contains !0 if character allowed in word. Not
|
||
only this is faster in most cases, but it simplifies the implementation
|
||
of the Break files. */
|
||
char word_fastmap[CHAR_SET_SIZE];
|
||
|
||
/* Maximum length of any word read. */
|
||
int maximum_word_length;
|
||
|
||
/* Maximum width of any reference used. */
|
||
int reference_max_width;
|
||
|
||
|
||
/* Ignore and Only word tables. */
|
||
|
||
WORD_TABLE ignore_table; /* table of words to ignore */
|
||
WORD_TABLE only_table; /* table of words to select */
|
||
|
||
#define ALLOC_NEW_WORD(table) \
|
||
BUMP_ALLOC ((table)->start, (table)->length, 8, WORD)
|
||
|
||
/* Source text table, and scanning macros. */
|
||
|
||
int number_input_files; /* number of text input files */
|
||
int total_line_count; /* total number of lines seen so far */
|
||
const char **input_file_name; /* array of text input file names */
|
||
int *file_line_count; /* array of `total_line_count' values at end */
|
||
|
||
BLOCK text_buffer; /* file to study */
|
||
char *text_buffer_maxend; /* allocated end of text_buffer */
|
||
|
||
/* SKIP_NON_WHITE used only for getting or skipping the reference. */
|
||
|
||
#define SKIP_NON_WHITE(cursor, limit) \
|
||
while (cursor < limit && !isspace(*cursor)) \
|
||
cursor++
|
||
|
||
#define SKIP_WHITE(cursor, limit) \
|
||
while (cursor < limit && isspace(*cursor)) \
|
||
cursor++
|
||
|
||
#define SKIP_WHITE_BACKWARDS(cursor, start) \
|
||
while (cursor > start && isspace(cursor[-1])) \
|
||
cursor--
|
||
|
||
#define SKIP_SOMETHING(cursor, limit) \
|
||
do \
|
||
if (word_regex_string) \
|
||
{ \
|
||
int count; \
|
||
count = re_match (word_regex, cursor, limit - cursor, 0, NULL); \
|
||
cursor += count <= 0 ? 1 : count; \
|
||
} \
|
||
else if (word_fastmap[(unsigned char) *cursor]) \
|
||
while (cursor < limit && word_fastmap[(unsigned char) *cursor]) \
|
||
cursor++; \
|
||
else \
|
||
cursor++; \
|
||
while (0)
|
||
|
||
/* Occurrences table.
|
||
|
||
The `keyword' pointer provides the central word, which is surrounded
|
||
by a left context and a right context. The `keyword' and `length'
|
||
field allow full 8-bit characters keys, even including NULs. At other
|
||
places in this program, the name `keyafter' refers to the keyword
|
||
followed by its right context.
|
||
|
||
The left context does not extend, towards the beginning of the file,
|
||
further than a distance given by the `left' value. This value is
|
||
relative to the keyword beginning, it is usually negative. This
|
||
insures that, except for white space, we will never have to backward
|
||
scan the source text, when it is time to generate the final output
|
||
lines.
|
||
|
||
The right context, indirectly attainable through the keyword end, does
|
||
not extend, towards the end of the file, further than a distance given
|
||
by the `right' value. This value is relative to the keyword
|
||
beginning, it is usually positive.
|
||
|
||
When automatic references are used, the `reference' value is the
|
||
overall line number in all input files read so far, in this case, it
|
||
is of type (int). When input references are used, the `reference'
|
||
value indicates the distance between the keyword beginning and the
|
||
start of the reference field, it is of type (DELTA) and usually
|
||
negative. */
|
||
|
||
typedef short DELTA; /* to hold displacement within one context */
|
||
|
||
typedef struct
|
||
{
|
||
WORD key; /* description of the keyword */
|
||
DELTA left; /* distance to left context start */
|
||
DELTA right; /* distance to right context end */
|
||
int reference; /* reference descriptor */
|
||
}
|
||
OCCURS;
|
||
|
||
/* The various OCCURS tables are indexed by the language. But the time
|
||
being, there is no such multiple language support. */
|
||
|
||
OCCURS *occurs_table[1]; /* all words retained from the read text */
|
||
size_t number_of_occurs[1]; /* number of used slots in occurs_table */
|
||
|
||
#define ALLOC_NEW_OCCURS(language) \
|
||
BUMP_ALLOC (occurs_table[language], number_of_occurs[language], 9, OCCURS)
|
||
|
||
|
||
/* Communication among output routines. */
|
||
|
||
/* Indicate if special output processing is requested for each character. */
|
||
char edited_flag[CHAR_SET_SIZE];
|
||
|
||
int half_line_width; /* half of line width, reference excluded */
|
||
int before_max_width; /* maximum width of before field */
|
||
int keyafter_max_width; /* maximum width of keyword-and-after field */
|
||
int truncation_string_length; /* length of string used to flag truncation */
|
||
|
||
/* When context is limited by lines, wraparound may happen on final output:
|
||
the `head' pointer gives access to some supplementary left context which
|
||
will be seen at the end of the output line, the `tail' pointer gives
|
||
access to some supplementary right context which will be seen at the
|
||
beginning of the output line. */
|
||
|
||
BLOCK tail; /* tail field */
|
||
int tail_truncation; /* flag truncation after the tail field */
|
||
|
||
BLOCK before; /* before field */
|
||
int before_truncation; /* flag truncation before the before field */
|
||
|
||
BLOCK keyafter; /* keyword-and-after field */
|
||
int keyafter_truncation; /* flag truncation after the keyafter field */
|
||
|
||
BLOCK head; /* head field */
|
||
int head_truncation; /* flag truncation before the head field */
|
||
|
||
BLOCK reference; /* reference field for input reference mode */
|
||
|
||
|
||
/* Miscellaneous routines. */
|
||
|
||
/*------------------------------------------------------.
|
||
| Duplicate string STRING, while evaluating \-escapes. |
|
||
`------------------------------------------------------*/
|
||
|
||
/* Loosely adapted from GNU shellutils printf.c code. */
|
||
|
||
char *
|
||
copy_unescaped_string (const char *string)
|
||
{
|
||
char *result; /* allocated result */
|
||
char *cursor; /* cursor in result */
|
||
int value; /* value of \nnn escape */
|
||
int length; /* length of \nnn escape */
|
||
|
||
result = xmalloc (strlen (string) + 1);
|
||
cursor = result;
|
||
|
||
while (*string)
|
||
if (*string == '\\')
|
||
{
|
||
string++;
|
||
switch (*string)
|
||
{
|
||
case 'x': /* \xhhh escape, 3 chars maximum */
|
||
value = 0;
|
||
for (length = 0, string++;
|
||
length < 3 && ISXDIGIT (*string);
|
||
length++, string++)
|
||
value = value * 16 + HEXTOBIN (*string);
|
||
if (length == 0)
|
||
{
|
||
*cursor++ = '\\';
|
||
*cursor++ = 'x';
|
||
}
|
||
else
|
||
*cursor++ = value;
|
||
break;
|
||
|
||
case '0': /* \0ooo escape, 3 chars maximum */
|
||
value = 0;
|
||
for (length = 0, string++;
|
||
length < 3 && ISODIGIT (*string);
|
||
length++, string++)
|
||
value = value * 8 + OCTTOBIN (*string);
|
||
*cursor++ = value;
|
||
break;
|
||
|
||
case 'a': /* alert */
|
||
#if __STDC__
|
||
*cursor++ = '\a';
|
||
#else
|
||
*cursor++ = 7;
|
||
#endif
|
||
string++;
|
||
break;
|
||
|
||
case 'b': /* backspace */
|
||
*cursor++ = '\b';
|
||
string++;
|
||
break;
|
||
|
||
case 'c': /* cancel the rest of the output */
|
||
while (*string)
|
||
string++;
|
||
break;
|
||
|
||
case 'f': /* form feed */
|
||
*cursor++ = '\f';
|
||
string++;
|
||
break;
|
||
|
||
case 'n': /* new line */
|
||
*cursor++ = '\n';
|
||
string++;
|
||
break;
|
||
|
||
case 'r': /* carriage return */
|
||
*cursor++ = '\r';
|
||
string++;
|
||
break;
|
||
|
||
case 't': /* horizontal tab */
|
||
*cursor++ = '\t';
|
||
string++;
|
||
break;
|
||
|
||
case 'v': /* vertical tab */
|
||
#if __STDC__
|
||
*cursor++ = '\v';
|
||
#else
|
||
*cursor++ = 11;
|
||
#endif
|
||
string++;
|
||
break;
|
||
|
||
default:
|
||
*cursor++ = '\\';
|
||
*cursor++ = *string++;
|
||
break;
|
||
}
|
||
}
|
||
else
|
||
*cursor++ = *string++;
|
||
|
||
*cursor = '\0';
|
||
return result;
|
||
}
|
||
|
||
/*-------------------------------------------------------------------.
|
||
| Compile the regex represented by STRING, diagnose and abort if any |
|
||
| error. Returns the compiled regex structure. |
|
||
`-------------------------------------------------------------------*/
|
||
|
||
struct re_pattern_buffer *
|
||
alloc_and_compile_regex (const char *string)
|
||
{
|
||
struct re_pattern_buffer *pattern; /* newly allocated structure */
|
||
const char *message; /* error message returned by regex.c */
|
||
|
||
pattern = (struct re_pattern_buffer *)
|
||
xmalloc (sizeof (struct re_pattern_buffer));
|
||
memset (pattern, 0, sizeof (struct re_pattern_buffer));
|
||
|
||
pattern->buffer = NULL;
|
||
pattern->allocated = 0;
|
||
pattern->translate = ignore_case ? (char *) folded_chars : NULL;
|
||
pattern->fastmap = (char *) xmalloc (CHAR_SET_SIZE);
|
||
|
||
message = re_compile_pattern (string, strlen (string), pattern);
|
||
if (message)
|
||
error (1, 0, "%s (for regexp `%s')", message, string);
|
||
|
||
/* The fastmap should be compiled before `re_match'. The following
|
||
call is not mandatory, because `re_search' is always called sooner,
|
||
and it compiles the fastmap if this has not been done yet. */
|
||
|
||
re_compile_fastmap (pattern);
|
||
|
||
/* Do not waste extra allocated space. */
|
||
|
||
if (pattern->allocated > pattern->used)
|
||
{
|
||
pattern->buffer
|
||
= (unsigned char *) xrealloc (pattern->buffer, pattern->used);
|
||
pattern->allocated = pattern->used;
|
||
}
|
||
|
||
return pattern;
|
||
}
|
||
|
||
/*------------------------------------------------------------------------.
|
||
| This will initialize various tables for pattern match and compiles some |
|
||
| regexps. |
|
||
`------------------------------------------------------------------------*/
|
||
|
||
void
|
||
initialize_regex (void)
|
||
{
|
||
int character; /* character value */
|
||
|
||
/* Initialize the regex syntax table. */
|
||
|
||
for (character = 0; character < CHAR_SET_SIZE; character++)
|
||
syntax_table[character] = isalpha (character) ? Sword : 0;
|
||
|
||
/* Initialize the case folding table. */
|
||
|
||
if (ignore_case)
|
||
for (character = 0; character < CHAR_SET_SIZE; character++)
|
||
folded_chars[character] = toupper (character);
|
||
|
||
/* Unless the user already provided a description of the end of line or
|
||
end of sentence sequence, select an end of line sequence to compile.
|
||
If the user provided an empty definition, thus disabling end of line
|
||
or sentence feature, make it NULL to speed up tests. If GNU
|
||
extensions are enabled, use end of sentence like in GNU emacs. If
|
||
disabled, use end of lines. */
|
||
|
||
if (context_regex_string)
|
||
{
|
||
if (!*context_regex_string)
|
||
context_regex_string = NULL;
|
||
}
|
||
else if (gnu_extensions && !input_reference)
|
||
context_regex_string = "[.?!][]\"')}]*\\($\\|\t\\| \\)[ \t\n]*";
|
||
else
|
||
context_regex_string = "\n";
|
||
|
||
if (context_regex_string)
|
||
context_regex = alloc_and_compile_regex (context_regex_string);
|
||
|
||
/* If the user has already provided a non-empty regexp to describe
|
||
words, compile it. Else, unless this has already been done through
|
||
a user provided Break character file, construct a fastmap of
|
||
characters that may appear in a word. If GNU extensions enabled,
|
||
include only letters of the underlying character set. If disabled,
|
||
include almost everything, even punctuations; stop only on white
|
||
space. */
|
||
|
||
if (word_regex_string && *word_regex_string)
|
||
word_regex = alloc_and_compile_regex (word_regex_string);
|
||
else if (!break_file)
|
||
if (gnu_extensions)
|
||
{
|
||
|
||
/* Simulate \w+. */
|
||
|
||
for (character = 0; character < CHAR_SET_SIZE; character++)
|
||
word_fastmap[character] = isalpha (character);
|
||
}
|
||
else
|
||
{
|
||
|
||
/* Simulate [^ \t\n]+. */
|
||
|
||
memset (word_fastmap, 1, CHAR_SET_SIZE);
|
||
word_fastmap[' '] = 0;
|
||
word_fastmap['\t'] = 0;
|
||
word_fastmap['\n'] = 0;
|
||
}
|
||
}
|
||
|
||
/*------------------------------------------------------------------------.
|
||
| This routine will attempt to swallow a whole file name FILE_NAME into a |
|
||
| contiguous region of memory and return a description of it into BLOCK. |
|
||
| Standard input is assumed whenever FILE_NAME is NULL, empty or "-". |
|
||
| |
|
||
| Previously, in some cases, white space compression was attempted while |
|
||
| inputting text. This was defeating some regexps like default end of |
|
||
| sentence, which checks for two consecutive spaces. If white space |
|
||
| compression is ever reinstated, it should be in output routines. |
|
||
`------------------------------------------------------------------------*/
|
||
|
||
void
|
||
swallow_file_in_memory (const char *file_name, BLOCK *block)
|
||
{
|
||
int file_handle; /* file descriptor number */
|
||
struct stat stat_block; /* stat block for file */
|
||
int allocated_length; /* allocated length of memory buffer */
|
||
int used_length; /* used length in memory buffer */
|
||
int read_length; /* number of character gotten on last read */
|
||
|
||
/* As special cases, a file name which is NULL or "-" indicates standard
|
||
input, which is already opened. In all other cases, open the file from
|
||
its name. */
|
||
|
||
if (!file_name || !*file_name || strcmp (file_name, "-") == 0)
|
||
file_handle = fileno (stdin);
|
||
else
|
||
if ((file_handle = open (file_name, O_RDONLY)) < 0)
|
||
error (1, errno, file_name);
|
||
|
||
/* If the file is a plain, regular file, allocate the memory buffer all at
|
||
once and swallow the file in one blow. In other cases, read the file
|
||
repeatedly in smaller chunks until we have it all, reallocating memory
|
||
once in a while, as we go. */
|
||
|
||
if (fstat (file_handle, &stat_block) < 0)
|
||
error (1, errno, file_name);
|
||
|
||
if (S_ISREG (stat_block.st_mode))
|
||
{
|
||
block->start = (char *) xmalloc ((int) stat_block.st_size);
|
||
|
||
if (read (file_handle, block->start, (int) stat_block.st_size)
|
||
!= stat_block.st_size)
|
||
error (1, errno, file_name);
|
||
|
||
block->end = block->start + stat_block.st_size;
|
||
}
|
||
else
|
||
{
|
||
block->start = (char *) xmalloc (1 << SWALLOW_REALLOC_LOG);
|
||
used_length = 0;
|
||
allocated_length = (1 << SWALLOW_REALLOC_LOG);
|
||
|
||
while ((read_length = read (file_handle,
|
||
block->start + used_length,
|
||
allocated_length - used_length)) > 0)
|
||
{
|
||
used_length += read_length;
|
||
if (used_length == allocated_length)
|
||
{
|
||
allocated_length += (1 << SWALLOW_REALLOC_LOG);
|
||
block->start
|
||
= (char *) xrealloc (block->start, allocated_length);
|
||
}
|
||
}
|
||
|
||
if (read_length < 0)
|
||
error (1, errno, file_name);
|
||
|
||
block->end = block->start + used_length;
|
||
}
|
||
|
||
/* Close the file, but only if it was not the standard input. */
|
||
|
||
if (file_handle != fileno (stdin))
|
||
close (file_handle);
|
||
}
|
||
|
||
/* Sort and search routines. */
|
||
|
||
/*--------------------------------------------------------------------------.
|
||
| Compare two words, FIRST and SECOND, and return 0 if they are identical. |
|
||
| Return less than 0 if the first word goes before the second; return |
|
||
| greater than 0 if the first word goes after the second. |
|
||
| |
|
||
| If a word is indeed a prefix of the other, the shorter should go first. |
|
||
`--------------------------------------------------------------------------*/
|
||
|
||
int
|
||
compare_words (const void *void_first, const void *void_second)
|
||
{
|
||
#define first ((WORD *) void_first)
|
||
#define second ((WORD *) void_second)
|
||
int length; /* minimum of two lengths */
|
||
int counter; /* cursor in words */
|
||
int value; /* value of comparison */
|
||
|
||
length = first->size < second->size ? first->size : second->size;
|
||
|
||
if (ignore_case)
|
||
{
|
||
for (counter = 0; counter < length; counter++)
|
||
{
|
||
value = (folded_chars [(unsigned char) (first->start[counter])]
|
||
- folded_chars [(unsigned char) (second->start[counter])]);
|
||
if (value != 0)
|
||
return value;
|
||
}
|
||
}
|
||
else
|
||
{
|
||
for (counter = 0; counter < length; counter++)
|
||
{
|
||
value = ((unsigned char) first->start[counter]
|
||
- (unsigned char) second->start[counter]);
|
||
if (value != 0)
|
||
return value;
|
||
}
|
||
}
|
||
|
||
return first->size - second->size;
|
||
#undef first
|
||
#undef second
|
||
}
|
||
|
||
/*-----------------------------------------------------------------------.
|
||
| Decides which of two OCCURS, FIRST or SECOND, should lexicographically |
|
||
| go first. In case of a tie, preserve the original order through a |
|
||
| pointer comparison. |
|
||
`-----------------------------------------------------------------------*/
|
||
|
||
int
|
||
compare_occurs (const void *void_first, const void *void_second)
|
||
{
|
||
#define first ((OCCURS *) void_first)
|
||
#define second ((OCCURS *) void_second)
|
||
int value;
|
||
|
||
value = compare_words (&first->key, &second->key);
|
||
return value == 0 ? first->key.start - second->key.start : value;
|
||
#undef first
|
||
#undef second
|
||
}
|
||
|
||
/*------------------------------------------------------------.
|
||
| Return !0 if WORD appears in TABLE. Uses a binary search. |
|
||
`------------------------------------------------------------*/
|
||
|
||
int
|
||
search_table (WORD *word, WORD_TABLE *table)
|
||
{
|
||
int lowest; /* current lowest possible index */
|
||
int highest; /* current highest possible index */
|
||
int middle; /* current middle index */
|
||
int value; /* value from last comparison */
|
||
|
||
lowest = 0;
|
||
highest = table->length - 1;
|
||
while (lowest <= highest)
|
||
{
|
||
middle = (lowest + highest) / 2;
|
||
value = compare_words (word, table->start + middle);
|
||
if (value < 0)
|
||
highest = middle - 1;
|
||
else if (value > 0)
|
||
lowest = middle + 1;
|
||
else
|
||
return 1;
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
/*---------------------------------------------------------------------.
|
||
| Sort the whole occurs table in memory. Presumably, `qsort' does not |
|
||
| take intermediate copies or table elements, so the sort will be |
|
||
| stabilized throughout the comparison routine. |
|
||
`---------------------------------------------------------------------*/
|
||
|
||
void
|
||
sort_found_occurs (void)
|
||
{
|
||
|
||
/* Only one language for the time being. */
|
||
|
||
qsort (occurs_table[0], number_of_occurs[0], sizeof (OCCURS),
|
||
compare_occurs);
|
||
}
|
||
|
||
/* Parameter files reading routines. */
|
||
|
||
/*----------------------------------------------------------------------.
|
||
| Read a file named FILE_NAME, containing a set of break characters. |
|
||
| Build a content to the array word_fastmap in which all characters are |
|
||
| allowed except those found in the file. Characters may be repeated. |
|
||
`----------------------------------------------------------------------*/
|
||
|
||
void
|
||
digest_break_file (const char *file_name)
|
||
{
|
||
BLOCK file_contents; /* to receive a copy of the file */
|
||
char *cursor; /* cursor in file copy */
|
||
|
||
swallow_file_in_memory (file_name, &file_contents);
|
||
|
||
/* Make the fastmap and record the file contents in it. */
|
||
|
||
memset (word_fastmap, 1, CHAR_SET_SIZE);
|
||
for (cursor = file_contents.start; cursor < file_contents.end; cursor++)
|
||
word_fastmap[(unsigned char) *cursor] = 0;
|
||
|
||
if (!gnu_extensions)
|
||
{
|
||
|
||
/* If GNU extensions are enabled, the only way to avoid newline as
|
||
a break character is to write all the break characters in the
|
||
file with no newline at all, not even at the end of the file.
|
||
If disabled, spaces, tabs and newlines are always considered as
|
||
break characters even if not included in the break file. */
|
||
|
||
word_fastmap[' '] = 0;
|
||
word_fastmap['\t'] = 0;
|
||
word_fastmap['\n'] = 0;
|
||
}
|
||
|
||
/* Return the space of the file, which is no more required. */
|
||
|
||
free (file_contents.start);
|
||
}
|
||
|
||
/*-----------------------------------------------------------------------.
|
||
| Read a file named FILE_NAME, containing one word per line, then |
|
||
| construct in TABLE a table of WORD descriptors for them. The routine |
|
||
| swallows the whole file in memory; this is at the expense of space |
|
||
| needed for newlines, which are useless; however, the reading is fast. |
|
||
`-----------------------------------------------------------------------*/
|
||
|
||
void
|
||
digest_word_file (const char *file_name, WORD_TABLE *table)
|
||
{
|
||
BLOCK file_contents; /* to receive a copy of the file */
|
||
char *cursor; /* cursor in file copy */
|
||
char *word_start; /* start of the current word */
|
||
|
||
swallow_file_in_memory (file_name, &file_contents);
|
||
|
||
table->start = NULL;
|
||
table->length = 0;
|
||
|
||
/* Read the whole file. */
|
||
|
||
cursor = file_contents.start;
|
||
while (cursor < file_contents.end)
|
||
{
|
||
|
||
/* Read one line, and save the word in contains. */
|
||
|
||
word_start = cursor;
|
||
while (cursor < file_contents.end && *cursor != '\n')
|
||
cursor++;
|
||
|
||
/* Record the word in table if it is not empty. */
|
||
|
||
if (cursor > word_start)
|
||
{
|
||
ALLOC_NEW_WORD (table);
|
||
table->start[table->length].start = word_start;
|
||
table->start[table->length].size = cursor - word_start;
|
||
table->length++;
|
||
}
|
||
|
||
/* This test allows for an incomplete line at end of file. */
|
||
|
||
if (cursor < file_contents.end)
|
||
cursor++;
|
||
}
|
||
|
||
/* Finally, sort all the words read. */
|
||
|
||
qsort (table->start, table->length, (size_t) sizeof (WORD), compare_words);
|
||
}
|
||
|
||
|
||
/* Keyword recognition and selection. */
|
||
|
||
/*----------------------------------------------------------------------.
|
||
| For each keyword in the source text, constructs an OCCURS structure. |
|
||
`----------------------------------------------------------------------*/
|
||
|
||
void
|
||
find_occurs_in_text (void)
|
||
{
|
||
char *cursor; /* for scanning the source text */
|
||
char *scan; /* for scanning the source text also */
|
||
char *line_start; /* start of the current input line */
|
||
char *line_scan; /* newlines scanned until this point */
|
||
int reference_length; /* length of reference in input mode */
|
||
WORD possible_key; /* possible key, to ease searches */
|
||
OCCURS *occurs_cursor; /* current OCCURS under construction */
|
||
|
||
char *context_start; /* start of left context */
|
||
char *context_end; /* end of right context */
|
||
char *word_start; /* start of word */
|
||
char *word_end; /* end of word */
|
||
char *next_context_start; /* next start of left context */
|
||
|
||
/* reference_length is always used within `if (input_reference)'.
|
||
However, GNU C diagnoses that it may be used uninitialized. The
|
||
following assignment is merely to shut it up. */
|
||
|
||
reference_length = 0;
|
||
|
||
/* Tracking where lines start is helpful for reference processing. In
|
||
auto reference mode, this allows counting lines. In input reference
|
||
mode, this permits finding the beginning of the references.
|
||
|
||
The first line begins with the file, skip immediately this very first
|
||
reference in input reference mode, to help further rejection any word
|
||
found inside it. Also, unconditionally assigning these variable has
|
||
the happy effect of shutting up lint. */
|
||
|
||
line_start = text_buffer.start;
|
||
line_scan = line_start;
|
||
if (input_reference)
|
||
{
|
||
SKIP_NON_WHITE (line_scan, text_buffer.end);
|
||
reference_length = line_scan - line_start;
|
||
SKIP_WHITE (line_scan, text_buffer.end);
|
||
}
|
||
|
||
/* Process the whole buffer, one line or one sentence at a time. */
|
||
|
||
for (cursor = text_buffer.start;
|
||
cursor < text_buffer.end;
|
||
cursor = next_context_start)
|
||
{
|
||
|
||
/* `context_start' gets initialized before the processing of each
|
||
line, or once for the whole buffer if no end of line or sentence
|
||
sequence separator. */
|
||
|
||
context_start = cursor;
|
||
|
||
/* If a end of line or end of sentence sequence is defined and
|
||
non-empty, `next_context_start' will be recomputed to be the end of
|
||
each line or sentence, before each one is processed. If no such
|
||
sequence, then `next_context_start' is set at the end of the whole
|
||
buffer, which is then considered to be a single line or sentence.
|
||
This test also accounts for the case of an incomplete line or
|
||
sentence at the end of the buffer. */
|
||
|
||
if (context_regex_string
|
||
&& (re_search (context_regex, cursor, text_buffer.end - cursor,
|
||
0, text_buffer.end - cursor, &context_regs)
|
||
>= 0))
|
||
next_context_start = cursor + context_regs.end[0];
|
||
|
||
else
|
||
next_context_start = text_buffer.end;
|
||
|
||
/* Include the separator into the right context, but not any suffix
|
||
white space in this separator; this insures it will be seen in
|
||
output and will not take more space than necessary. */
|
||
|
||
context_end = next_context_start;
|
||
SKIP_WHITE_BACKWARDS (context_end, context_start);
|
||
|
||
/* Read and process a single input line or sentence, one word at a
|
||
time. */
|
||
|
||
while (1)
|
||
{
|
||
if (word_regex)
|
||
|
||
/* If a word regexp has been compiled, use it to skip at the
|
||
beginning of the next word. If there is no such word, exit
|
||
the loop. */
|
||
|
||
{
|
||
if (re_search (word_regex, cursor, context_end - cursor,
|
||
0, context_end - cursor, &word_regs)
|
||
< 0)
|
||
break;
|
||
word_start = cursor + word_regs.start[0];
|
||
word_end = cursor + word_regs.end[0];
|
||
}
|
||
else
|
||
|
||
/* Avoid re_search and use the fastmap to skip to the
|
||
beginning of the next word. If there is no more word in
|
||
the buffer, exit the loop. */
|
||
|
||
{
|
||
scan = cursor;
|
||
while (scan < context_end
|
||
&& !word_fastmap[(unsigned char) *scan])
|
||
scan++;
|
||
|
||
if (scan == context_end)
|
||
break;
|
||
|
||
word_start = scan;
|
||
|
||
while (scan < context_end
|
||
&& word_fastmap[(unsigned char) *scan])
|
||
scan++;
|
||
|
||
word_end = scan;
|
||
}
|
||
|
||
/* Skip right to the beginning of the found word. */
|
||
|
||
cursor = word_start;
|
||
|
||
/* Skip any zero length word. Just advance a single position,
|
||
then go fetch the next word. */
|
||
|
||
if (word_end == word_start)
|
||
{
|
||
cursor++;
|
||
continue;
|
||
}
|
||
|
||
/* This is a genuine, non empty word, so save it as a possible
|
||
key. Then skip over it. Also, maintain the maximum length of
|
||
all words read so far. It is mandatory to take the maximum
|
||
length of all words in the file, without considering if they
|
||
are actually kept or rejected, because backward jumps at output
|
||
generation time may fall in *any* word. */
|
||
|
||
possible_key.start = cursor;
|
||
possible_key.size = word_end - word_start;
|
||
cursor += possible_key.size;
|
||
|
||
if (possible_key.size > maximum_word_length)
|
||
maximum_word_length = possible_key.size;
|
||
|
||
/* In input reference mode, update `line_start' from its previous
|
||
value. Count the lines just in case auto reference mode is
|
||
also selected. If it happens that the word just matched is
|
||
indeed part of a reference; just ignore it. */
|
||
|
||
if (input_reference)
|
||
{
|
||
while (line_scan < possible_key.start)
|
||
if (*line_scan == '\n')
|
||
{
|
||
total_line_count++;
|
||
line_scan++;
|
||
line_start = line_scan;
|
||
SKIP_NON_WHITE (line_scan, text_buffer.end);
|
||
reference_length = line_scan - line_start;
|
||
}
|
||
else
|
||
line_scan++;
|
||
if (line_scan > possible_key.start)
|
||
continue;
|
||
}
|
||
|
||
/* Ignore the word if an `Ignore words' table exists and if it is
|
||
part of it. Also ignore the word if an `Only words' table and
|
||
if it is *not* part of it.
|
||
|
||
It is allowed that both tables be used at once, even if this
|
||
may look strange for now. Just ignore a word that would appear
|
||
in both. If regexps are eventually implemented for these
|
||
tables, the Ignore table could then reject words that would
|
||
have been previously accepted by the Only table. */
|
||
|
||
if (ignore_file && search_table (&possible_key, &ignore_table))
|
||
continue;
|
||
if (only_file && !search_table (&possible_key, &only_table))
|
||
continue;
|
||
|
||
/* A non-empty word has been found. First of all, insure
|
||
proper allocation of the next OCCURS, and make a pointer to
|
||
where it will be constructed. */
|
||
|
||
ALLOC_NEW_OCCURS (0);
|
||
occurs_cursor = occurs_table[0] + number_of_occurs[0];
|
||
|
||
/* Define the refence field, if any. */
|
||
|
||
if (auto_reference)
|
||
{
|
||
|
||
/* While auto referencing, update `line_start' from its
|
||
previous value, counting lines as we go. If input
|
||
referencing at the same time, `line_start' has been
|
||
advanced earlier, and the following loop is never really
|
||
executed. */
|
||
|
||
while (line_scan < possible_key.start)
|
||
if (*line_scan == '\n')
|
||
{
|
||
total_line_count++;
|
||
line_scan++;
|
||
line_start = line_scan;
|
||
SKIP_NON_WHITE (line_scan, text_buffer.end);
|
||
}
|
||
else
|
||
line_scan++;
|
||
|
||
occurs_cursor->reference = total_line_count;
|
||
}
|
||
else if (input_reference)
|
||
{
|
||
|
||
/* If only input referencing, `line_start' has been computed
|
||
earlier to detect the case the word matched would be part
|
||
of the reference. The reference position is simply the
|
||
value of `line_start'. */
|
||
|
||
occurs_cursor->reference
|
||
= (DELTA) (line_start - possible_key.start);
|
||
if (reference_length > reference_max_width)
|
||
reference_max_width = reference_length;
|
||
}
|
||
|
||
/* Exclude the reference from the context in simple cases. */
|
||
|
||
if (input_reference && line_start == context_start)
|
||
{
|
||
SKIP_NON_WHITE (context_start, context_end);
|
||
SKIP_WHITE (context_start, context_end);
|
||
}
|
||
|
||
/* Completes the OCCURS structure. */
|
||
|
||
occurs_cursor->key = possible_key;
|
||
occurs_cursor->left = context_start - possible_key.start;
|
||
occurs_cursor->right = context_end - possible_key.start;
|
||
|
||
number_of_occurs[0]++;
|
||
}
|
||
}
|
||
}
|
||
|
||
/* Formatting and actual output - service routines. */
|
||
|
||
/*-----------------------------------------.
|
||
| Prints some NUMBER of spaces on stdout. |
|
||
`-----------------------------------------*/
|
||
|
||
void
|
||
print_spaces (int number)
|
||
{
|
||
int counter;
|
||
|
||
for (counter = number; counter > 0; counter--)
|
||
putchar (' ');
|
||
}
|
||
|
||
/*-------------------------------------.
|
||
| Prints the field provided by FIELD. |
|
||
`-------------------------------------*/
|
||
|
||
void
|
||
print_field (BLOCK field)
|
||
{
|
||
char *cursor; /* Cursor in field to print */
|
||
int character; /* Current character */
|
||
int base; /* Base character, without diacritic */
|
||
int diacritic; /* Diacritic code for the character */
|
||
|
||
/* Whitespace is not really compressed. Instead, each white space
|
||
character (tab, vt, ht etc.) is printed as one single space. */
|
||
|
||
for (cursor = field.start; cursor < field.end; cursor++)
|
||
{
|
||
character = (unsigned char) *cursor;
|
||
if (edited_flag[character])
|
||
{
|
||
|
||
/* First check if this is a diacriticized character.
|
||
|
||
This works only for TeX. I do not know how diacriticized
|
||
letters work with `roff'. Please someone explain it to me! */
|
||
|
||
diacritic = todiac (character);
|
||
if (diacritic != 0 && output_format == TEX_FORMAT)
|
||
{
|
||
base = tobase (character);
|
||
switch (diacritic)
|
||
{
|
||
|
||
case 1: /* Latin diphthongs */
|
||
switch (base)
|
||
{
|
||
case 'o':
|
||
printf ("\\oe{}");
|
||
break;
|
||
|
||
case 'O':
|
||
printf ("\\OE{}");
|
||
break;
|
||
|
||
case 'a':
|
||
printf ("\\ae{}");
|
||
break;
|
||
|
||
case 'A':
|
||
printf ("\\AE{}");
|
||
break;
|
||
|
||
default:
|
||
putchar (' ');
|
||
}
|
||
break;
|
||
|
||
case 2: /* Acute accent */
|
||
printf ("\\'%s%c", (base == 'i' ? "\\" : ""), base);
|
||
break;
|
||
|
||
case 3: /* Grave accent */
|
||
printf ("\\`%s%c", (base == 'i' ? "\\" : ""), base);
|
||
break;
|
||
|
||
case 4: /* Circumflex accent */
|
||
printf ("\\^%s%c", (base == 'i' ? "\\" : ""), base);
|
||
break;
|
||
|
||
case 5: /* Diaeresis */
|
||
printf ("\\\"%s%c", (base == 'i' ? "\\" : ""), base);
|
||
break;
|
||
|
||
case 6: /* Tilde accent */
|
||
printf ("\\~%s%c", (base == 'i' ? "\\" : ""), base);
|
||
break;
|
||
|
||
case 7: /* Cedilla */
|
||
printf ("\\c{%c}", base);
|
||
break;
|
||
|
||
case 8: /* Small circle beneath */
|
||
switch (base)
|
||
{
|
||
case 'a':
|
||
printf ("\\aa{}");
|
||
break;
|
||
|
||
case 'A':
|
||
printf ("\\AA{}");
|
||
break;
|
||
|
||
default:
|
||
putchar (' ');
|
||
}
|
||
break;
|
||
|
||
case 9: /* Strike through */
|
||
switch (base)
|
||
{
|
||
case 'o':
|
||
printf ("\\o{}");
|
||
break;
|
||
|
||
case 'O':
|
||
printf ("\\O{}");
|
||
break;
|
||
|
||
default:
|
||
putchar (' ');
|
||
}
|
||
break;
|
||
}
|
||
}
|
||
else
|
||
|
||
/* This is not a diacritic character, so handle cases which are
|
||
really specific to `roff' or TeX. All white space processing
|
||
is done as the default case of this switch. */
|
||
|
||
switch (character)
|
||
{
|
||
case '"':
|
||
/* In roff output format, double any quote. */
|
||
putchar ('"');
|
||
putchar ('"');
|
||
break;
|
||
|
||
case '$':
|
||
case '%':
|
||
case '&':
|
||
case '#':
|
||
case '_':
|
||
/* In TeX output format, precede these with a backslash. */
|
||
putchar ('\\');
|
||
putchar (character);
|
||
break;
|
||
|
||
case '{':
|
||
case '}':
|
||
/* In TeX output format, precede these with a backslash and
|
||
force mathematical mode. */
|
||
printf ("$\\%c$", character);
|
||
break;
|
||
|
||
case '\\':
|
||
/* In TeX output mode, request production of a backslash. */
|
||
printf ("\\backslash{}");
|
||
break;
|
||
|
||
default:
|
||
/* Any other flagged character produces a single space. */
|
||
putchar (' ');
|
||
}
|
||
}
|
||
else
|
||
putchar (*cursor);
|
||
}
|
||
}
|
||
|
||
|
||
/* Formatting and actual output - planning routines. */
|
||
|
||
/*--------------------------------------------------------------------.
|
||
| From information collected from command line options and input file |
|
||
| readings, compute and fix some output parameter values. |
|
||
`--------------------------------------------------------------------*/
|
||
|
||
void
|
||
fix_output_parameters (void)
|
||
{
|
||
int file_index; /* index in text input file arrays */
|
||
int line_ordinal; /* line ordinal value for reference */
|
||
char ordinal_string[12]; /* edited line ordinal for reference */
|
||
int reference_width; /* width for the whole reference */
|
||
int character; /* character ordinal */
|
||
const char *cursor; /* cursor in some constant strings */
|
||
|
||
/* In auto reference mode, the maximum width of this field is
|
||
precomputed and subtracted from the overall line width. Add one for
|
||
the column which separate the file name from the line number. */
|
||
|
||
if (auto_reference)
|
||
{
|
||
reference_max_width = 0;
|
||
for (file_index = 0; file_index < number_input_files; file_index++)
|
||
{
|
||
line_ordinal = file_line_count[file_index] + 1;
|
||
if (file_index > 0)
|
||
line_ordinal -= file_line_count[file_index - 1];
|
||
sprintf (ordinal_string, "%d", line_ordinal);
|
||
reference_width = strlen (ordinal_string);
|
||
if (input_file_name[file_index])
|
||
reference_width += strlen (input_file_name[file_index]);
|
||
if (reference_width > reference_max_width)
|
||
reference_max_width = reference_width;
|
||
}
|
||
reference_max_width++;
|
||
reference.start = (char *) xmalloc (reference_max_width + 1);
|
||
}
|
||
|
||
/* If the reference appears to the left of the output line, reserve some
|
||
space for it right away, including one gap size. */
|
||
|
||
if ((auto_reference || input_reference) && !right_reference)
|
||
line_width -= reference_max_width + gap_size;
|
||
|
||
/* The output lines, minimally, will contain from left to right a left
|
||
context, a gap, and a keyword followed by the right context with no
|
||
special intervening gap. Half of the line width is dedicated to the
|
||
left context and the gap, the other half is dedicated to the keyword
|
||
and the right context; these values are computed once and for all here.
|
||
There also are tail and head wrap around fields, used when the keyword
|
||
is near the beginning or the end of the line, or when some long word
|
||
cannot fit in, but leave place from wrapped around shorter words. The
|
||
maximum width of these fields are recomputed separately for each line,
|
||
on a case by case basis. It is worth noting that it cannot happen that
|
||
both the tail and head fields are used at once. */
|
||
|
||
half_line_width = line_width / 2;
|
||
before_max_width = half_line_width - gap_size;
|
||
keyafter_max_width = half_line_width;
|
||
|
||
/* If truncation_string is the empty string, make it NULL to speed up
|
||
tests. In this case, truncation_string_length will never get used, so
|
||
there is no need to set it. */
|
||
|
||
if (truncation_string && *truncation_string)
|
||
truncation_string_length = strlen (truncation_string);
|
||
else
|
||
truncation_string = NULL;
|
||
|
||
if (gnu_extensions)
|
||
{
|
||
|
||
/* When flagging truncation at the left of the keyword, the
|
||
truncation mark goes at the beginning of the before field,
|
||
unless there is a head field, in which case the mark goes at the
|
||
left of the head field. When flagging truncation at the right
|
||
of the keyword, the mark goes at the end of the keyafter field,
|
||
unless there is a tail field, in which case the mark goes at the
|
||
end of the tail field. Only eight combination cases could arise
|
||
for truncation marks:
|
||
|
||
. None.
|
||
. One beginning the before field.
|
||
. One beginning the head field.
|
||
. One ending the keyafter field.
|
||
. One ending the tail field.
|
||
. One beginning the before field, another ending the keyafter field.
|
||
. One ending the tail field, another beginning the before field.
|
||
. One ending the keyafter field, another beginning the head field.
|
||
|
||
So, there is at most two truncation marks, which could appear both
|
||
on the left side of the center of the output line, both on the
|
||
right side, or one on either side. */
|
||
|
||
before_max_width -= 2 * truncation_string_length;
|
||
keyafter_max_width -= 2 * truncation_string_length;
|
||
}
|
||
else
|
||
{
|
||
|
||
/* I never figured out exactly how UNIX' ptx plans the output width
|
||
of its various fields. If GNU extensions are disabled, do not
|
||
try computing the field widths correctly; instead, use the
|
||
following formula, which does not completely imitate UNIX' ptx,
|
||
but almost. */
|
||
|
||
keyafter_max_width -= 2 * truncation_string_length + 1;
|
||
}
|
||
|
||
/* Compute which characters need special output processing. Initialize
|
||
by flagging any white space character. Some systems do not consider
|
||
form feed as a space character, but we do. */
|
||
|
||
for (character = 0; character < CHAR_SET_SIZE; character++)
|
||
edited_flag[character] = isspace (character);
|
||
edited_flag['\f'] = 1;
|
||
|
||
/* Complete the special character flagging according to selected output
|
||
format. */
|
||
|
||
switch (output_format)
|
||
{
|
||
case UNKNOWN_FORMAT:
|
||
/* Should never happen. */
|
||
|
||
case DUMB_FORMAT:
|
||
break;
|
||
|
||
case ROFF_FORMAT:
|
||
|
||
/* `Quote' characters should be doubled. */
|
||
|
||
edited_flag['"'] = 1;
|
||
break;
|
||
|
||
case TEX_FORMAT:
|
||
|
||
/* Various characters need special processing. */
|
||
|
||
for (cursor = "$%&#_{}\\"; *cursor; cursor++)
|
||
edited_flag[*cursor] = 1;
|
||
|
||
/* Any character with 8th bit set will print to a single space, unless
|
||
it is diacriticized. */
|
||
|
||
for (character = 0200; character < CHAR_SET_SIZE; character++)
|
||
edited_flag[character] = todiac (character) != 0;
|
||
break;
|
||
}
|
||
}
|
||
|
||
/*------------------------------------------------------------------.
|
||
| Compute the position and length of all the output fields, given a |
|
||
| pointer to some OCCURS. |
|
||
`------------------------------------------------------------------*/
|
||
|
||
void
|
||
define_all_fields (OCCURS *occurs)
|
||
{
|
||
int tail_max_width; /* allowable width of tail field */
|
||
int head_max_width; /* allowable width of head field */
|
||
char *cursor; /* running cursor in source text */
|
||
char *left_context_start; /* start of left context */
|
||
char *right_context_end; /* end of right context */
|
||
char *left_field_start; /* conservative start for `head'/`before' */
|
||
int file_index; /* index in text input file arrays */
|
||
const char *file_name; /* file name for reference */
|
||
int line_ordinal; /* line ordinal for reference */
|
||
|
||
/* Define `keyafter', start of left context and end of right context.
|
||
`keyafter' starts at the saved position for keyword and extend to the
|
||
right from the end of the keyword, eating separators or full words, but
|
||
not beyond maximum allowed width for `keyafter' field or limit for the
|
||
right context. Suffix spaces will be removed afterwards. */
|
||
|
||
keyafter.start = occurs->key.start;
|
||
keyafter.end = keyafter.start + occurs->key.size;
|
||
left_context_start = keyafter.start + occurs->left;
|
||
right_context_end = keyafter.start + occurs->right;
|
||
|
||
cursor = keyafter.end;
|
||
while (cursor < right_context_end
|
||
&& cursor <= keyafter.start + keyafter_max_width)
|
||
{
|
||
keyafter.end = cursor;
|
||
SKIP_SOMETHING (cursor, right_context_end);
|
||
}
|
||
if (cursor <= keyafter.start + keyafter_max_width)
|
||
keyafter.end = cursor;
|
||
|
||
keyafter_truncation = truncation_string && keyafter.end < right_context_end;
|
||
|
||
SKIP_WHITE_BACKWARDS (keyafter.end, keyafter.start);
|
||
|
||
/* When the left context is wide, it might take some time to catch up from
|
||
the left context boundary to the beginning of the `head' or `before'
|
||
fields. So, in this case, to speed the catchup, we jump back from the
|
||
keyword, using some secure distance, possibly falling in the middle of
|
||
a word. A secure backward jump would be at least half the maximum
|
||
width of a line, plus the size of the longest word met in the whole
|
||
input. We conclude this backward jump by a skip forward of at least
|
||
one word. In this manner, we should not inadvertently accept only part
|
||
of a word. From the reached point, when it will be time to fix the
|
||
beginning of `head' or `before' fields, we will skip forward words or
|
||
delimiters until we get sufficiently near. */
|
||
|
||
if (-occurs->left > half_line_width + maximum_word_length)
|
||
{
|
||
left_field_start
|
||
= keyafter.start - (half_line_width + maximum_word_length);
|
||
SKIP_SOMETHING (left_field_start, keyafter.start);
|
||
}
|
||
else
|
||
left_field_start = keyafter.start + occurs->left;
|
||
|
||
/* `before' certainly ends at the keyword, but not including separating
|
||
spaces. It starts after than the saved value for the left context, by
|
||
advancing it until it falls inside the maximum allowed width for the
|
||
before field. There will be no prefix spaces either. `before' only
|
||
advances by skipping single separators or whole words. */
|
||
|
||
before.start = left_field_start;
|
||
before.end = keyafter.start;
|
||
SKIP_WHITE_BACKWARDS (before.end, before.start);
|
||
|
||
while (before.start + before_max_width < before.end)
|
||
SKIP_SOMETHING (before.start, before.end);
|
||
|
||
if (truncation_string)
|
||
{
|
||
cursor = before.start;
|
||
SKIP_WHITE_BACKWARDS (cursor, text_buffer.start);
|
||
before_truncation = cursor > left_context_start;
|
||
}
|
||
else
|
||
before_truncation = 0;
|
||
|
||
SKIP_WHITE (before.start, text_buffer.end);
|
||
|
||
/* The tail could not take more columns than what has been left in the
|
||
left context field, and a gap is mandatory. It starts after the
|
||
right context, and does not contain prefixed spaces. It ends at
|
||
the end of line, the end of buffer or when the tail field is full,
|
||
whichever comes first. It cannot contain only part of a word, and
|
||
has no suffixed spaces. */
|
||
|
||
tail_max_width
|
||
= before_max_width - (before.end - before.start) - gap_size;
|
||
|
||
if (tail_max_width > 0)
|
||
{
|
||
tail.start = keyafter.end;
|
||
SKIP_WHITE (tail.start, text_buffer.end);
|
||
|
||
tail.end = tail.start;
|
||
cursor = tail.end;
|
||
while (cursor < right_context_end
|
||
&& cursor < tail.start + tail_max_width)
|
||
{
|
||
tail.end = cursor;
|
||
SKIP_SOMETHING (cursor, right_context_end);
|
||
}
|
||
|
||
if (cursor < tail.start + tail_max_width)
|
||
tail.end = cursor;
|
||
|
||
if (tail.end > tail.start)
|
||
{
|
||
keyafter_truncation = 0;
|
||
tail_truncation = truncation_string && tail.end < right_context_end;
|
||
}
|
||
else
|
||
tail_truncation = 0;
|
||
|
||
SKIP_WHITE_BACKWARDS (tail.end, tail.start);
|
||
}
|
||
else
|
||
{
|
||
|
||
/* No place left for a tail field. */
|
||
|
||
tail.start = NULL;
|
||
tail.end = NULL;
|
||
tail_truncation = 0;
|
||
}
|
||
|
||
/* `head' could not take more columns than what has been left in the right
|
||
context field, and a gap is mandatory. It ends before the left
|
||
context, and does not contain suffixed spaces. Its pointer is advanced
|
||
until the head field has shrunk to its allowed width. It cannot
|
||
contain only part of a word, and has no suffixed spaces. */
|
||
|
||
head_max_width
|
||
= keyafter_max_width - (keyafter.end - keyafter.start) - gap_size;
|
||
|
||
if (head_max_width > 0)
|
||
{
|
||
head.end = before.start;
|
||
SKIP_WHITE_BACKWARDS (head.end, text_buffer.start);
|
||
|
||
head.start = left_field_start;
|
||
while (head.start + head_max_width < head.end)
|
||
SKIP_SOMETHING (head.start, head.end);
|
||
|
||
if (head.end > head.start)
|
||
{
|
||
before_truncation = 0;
|
||
head_truncation = (truncation_string
|
||
&& head.start > left_context_start);
|
||
}
|
||
else
|
||
head_truncation = 0;
|
||
|
||
SKIP_WHITE (head.start, head.end);
|
||
}
|
||
else
|
||
{
|
||
|
||
/* No place left for a head field. */
|
||
|
||
head.start = NULL;
|
||
head.end = NULL;
|
||
head_truncation = 0;
|
||
}
|
||
|
||
if (auto_reference)
|
||
{
|
||
|
||
/* Construct the reference text in preallocated space from the file
|
||
name and the line number. Find out in which file the reference
|
||
occurred. Standard input yields an empty file name. Insure line
|
||
numbers are one based, even if they are computed zero based. */
|
||
|
||
file_index = 0;
|
||
while (file_line_count[file_index] < occurs->reference)
|
||
file_index++;
|
||
|
||
file_name = input_file_name[file_index];
|
||
if (!file_name)
|
||
file_name = "";
|
||
|
||
line_ordinal = occurs->reference + 1;
|
||
if (file_index > 0)
|
||
line_ordinal -= file_line_count[file_index - 1];
|
||
|
||
sprintf (reference.start, "%s:%d", file_name, line_ordinal);
|
||
reference.end = reference.start + strlen (reference.start);
|
||
}
|
||
else if (input_reference)
|
||
{
|
||
|
||
/* Reference starts at saved position for reference and extends right
|
||
until some white space is met. */
|
||
|
||
reference.start = keyafter.start + (DELTA) occurs->reference;
|
||
reference.end = reference.start;
|
||
SKIP_NON_WHITE (reference.end, right_context_end);
|
||
}
|
||
}
|
||
|
||
|
||
/* Formatting and actual output - control routines. */
|
||
|
||
/*----------------------------------------------------------------------.
|
||
| Output the current output fields as one line for `troff' or `nroff'. |
|
||
`----------------------------------------------------------------------*/
|
||
|
||
void
|
||
output_one_roff_line (void)
|
||
{
|
||
/* Output the `tail' field. */
|
||
|
||
printf (".%s \"", macro_name);
|
||
print_field (tail);
|
||
if (tail_truncation)
|
||
printf ("%s", truncation_string);
|
||
putchar ('"');
|
||
|
||
/* Output the `before' field. */
|
||
|
||
printf (" \"");
|
||
if (before_truncation)
|
||
printf ("%s", truncation_string);
|
||
print_field (before);
|
||
putchar ('"');
|
||
|
||
/* Output the `keyafter' field. */
|
||
|
||
printf (" \"");
|
||
print_field (keyafter);
|
||
if (keyafter_truncation)
|
||
printf ("%s", truncation_string);
|
||
putchar ('"');
|
||
|
||
/* Output the `head' field. */
|
||
|
||
printf (" \"");
|
||
if (head_truncation)
|
||
printf ("%s", truncation_string);
|
||
print_field (head);
|
||
putchar ('"');
|
||
|
||
/* Conditionally output the `reference' field. */
|
||
|
||
if (auto_reference || input_reference)
|
||
{
|
||
printf (" \"");
|
||
print_field (reference);
|
||
putchar ('"');
|
||
}
|
||
|
||
putchar ('\n');
|
||
}
|
||
|
||
/*---------------------------------------------------------.
|
||
| Output the current output fields as one line for `TeX'. |
|
||
`---------------------------------------------------------*/
|
||
|
||
void
|
||
output_one_tex_line (void)
|
||
{
|
||
BLOCK key; /* key field, isolated */
|
||
BLOCK after; /* after field, isolated */
|
||
char *cursor; /* running cursor in source text */
|
||
|
||
printf ("\\%s ", macro_name);
|
||
printf ("{");
|
||
print_field (tail);
|
||
printf ("}{");
|
||
print_field (before);
|
||
printf ("}{");
|
||
key.start = keyafter.start;
|
||
after.end = keyafter.end;
|
||
cursor = keyafter.start;
|
||
SKIP_SOMETHING (cursor, keyafter.end);
|
||
key.end = cursor;
|
||
after.start = cursor;
|
||
print_field (key);
|
||
printf ("}{");
|
||
print_field (after);
|
||
printf ("}{");
|
||
print_field (head);
|
||
printf ("}");
|
||
if (auto_reference || input_reference)
|
||
{
|
||
printf ("{");
|
||
print_field (reference);
|
||
printf ("}");
|
||
}
|
||
printf ("\n");
|
||
}
|
||
|
||
/*-------------------------------------------------------------------.
|
||
| Output the current output fields as one line for a dumb terminal. |
|
||
`-------------------------------------------------------------------*/
|
||
|
||
void
|
||
output_one_dumb_line (void)
|
||
{
|
||
if (!right_reference)
|
||
if (auto_reference)
|
||
{
|
||
|
||
/* Output the `reference' field, in such a way that GNU emacs
|
||
next-error will handle it. The ending colon is taken from the
|
||
gap which follows. */
|
||
|
||
print_field (reference);
|
||
putchar (':');
|
||
print_spaces (reference_max_width
|
||
+ gap_size
|
||
- (reference.end - reference.start)
|
||
- 1);
|
||
}
|
||
else
|
||
{
|
||
|
||
/* Output the `reference' field and its following gap. */
|
||
|
||
print_field (reference);
|
||
print_spaces (reference_max_width
|
||
+ gap_size
|
||
- (reference.end - reference.start));
|
||
}
|
||
|
||
if (tail.start < tail.end)
|
||
{
|
||
/* Output the `tail' field. */
|
||
|
||
print_field (tail);
|
||
if (tail_truncation)
|
||
printf ("%s", truncation_string);
|
||
|
||
print_spaces (half_line_width - gap_size
|
||
- (before.end - before.start)
|
||
- (before_truncation ? truncation_string_length : 0)
|
||
- (tail.end - tail.start)
|
||
- (tail_truncation ? truncation_string_length : 0));
|
||
}
|
||
else
|
||
print_spaces (half_line_width - gap_size
|
||
- (before.end - before.start)
|
||
- (before_truncation ? truncation_string_length : 0));
|
||
|
||
/* Output the `before' field. */
|
||
|
||
if (before_truncation)
|
||
printf ("%s", truncation_string);
|
||
print_field (before);
|
||
|
||
print_spaces (gap_size);
|
||
|
||
/* Output the `keyafter' field. */
|
||
|
||
print_field (keyafter);
|
||
if (keyafter_truncation)
|
||
printf ("%s", truncation_string);
|
||
|
||
if (head.start < head.end)
|
||
{
|
||
/* Output the `head' field. */
|
||
|
||
print_spaces (half_line_width
|
||
- (keyafter.end - keyafter.start)
|
||
- (keyafter_truncation ? truncation_string_length : 0)
|
||
- (head.end - head.start)
|
||
- (head_truncation ? truncation_string_length : 0));
|
||
if (head_truncation)
|
||
printf ("%s", truncation_string);
|
||
print_field (head);
|
||
}
|
||
else
|
||
|
||
if ((auto_reference || input_reference) && right_reference)
|
||
print_spaces (half_line_width
|
||
- (keyafter.end - keyafter.start)
|
||
- (keyafter_truncation ? truncation_string_length : 0));
|
||
|
||
if ((auto_reference || input_reference) && right_reference)
|
||
{
|
||
/* Output the `reference' field. */
|
||
|
||
print_spaces (gap_size);
|
||
print_field (reference);
|
||
}
|
||
|
||
printf ("\n");
|
||
}
|
||
|
||
/*------------------------------------------------------------------------.
|
||
| Scan the whole occurs table and, for each entry, output one line in the |
|
||
| appropriate format. |
|
||
`------------------------------------------------------------------------*/
|
||
|
||
void
|
||
generate_all_output (void)
|
||
{
|
||
int occurs_index; /* index of keyword entry being processed */
|
||
OCCURS *occurs_cursor; /* current keyword entry being processed */
|
||
|
||
|
||
/* The following assignments are useful to provide default values in case
|
||
line contexts or references are not used, in which case these variables
|
||
would never be computed. */
|
||
|
||
tail.start = NULL;
|
||
tail.end = NULL;
|
||
tail_truncation = 0;
|
||
|
||
head.start = NULL;
|
||
head.end = NULL;
|
||
head_truncation = 0;
|
||
|
||
|
||
/* Loop over all keyword occurrences. */
|
||
|
||
occurs_cursor = occurs_table[0];
|
||
|
||
for (occurs_index = 0; occurs_index < number_of_occurs[0]; occurs_index++)
|
||
{
|
||
/* Compute the exact size of every field and whenever truncation flags
|
||
are present or not. */
|
||
|
||
define_all_fields (occurs_cursor);
|
||
|
||
/* Produce one output line according to selected format. */
|
||
|
||
switch (output_format)
|
||
{
|
||
case UNKNOWN_FORMAT:
|
||
/* Should never happen. */
|
||
|
||
case DUMB_FORMAT:
|
||
output_one_dumb_line ();
|
||
break;
|
||
|
||
case ROFF_FORMAT:
|
||
output_one_roff_line ();
|
||
break;
|
||
|
||
case TEX_FORMAT:
|
||
output_one_tex_line ();
|
||
break;
|
||
}
|
||
|
||
/* Advance the cursor into the occurs table. */
|
||
|
||
occurs_cursor++;
|
||
}
|
||
}
|
||
|
||
/* Option decoding and main program. */
|
||
|
||
/*------------------------------------------------------.
|
||
| Print program identification and options, then exit. |
|
||
`------------------------------------------------------*/
|
||
|
||
void
|
||
usage (int status)
|
||
{
|
||
if (status != 0)
|
||
fprintf (stderr, "Try `%s --help' for more information.\n", program_name);
|
||
else
|
||
{
|
||
printf ("\
|
||
Usage: %s [OPTION]... [INPUT]... (without -G)\n\
|
||
or: %s -G [OPTION]... [INPUT [OUTPUT]]\n", program_name, program_name);
|
||
printf ("\
|
||
\n\
|
||
-A, --auto-reference output automatically generated references\n\
|
||
-C, --copyright display Copyright and copying conditions\n\
|
||
-G, --traditional behave more like System V `ptx'\n\
|
||
-F, --flag-truncation=STRING use STRING for flagging line truncations\n\
|
||
-M, --macro-name=STRING macro name to use instead of `xx'\n\
|
||
-O, --format=roff generate output as roff directives\n\
|
||
-R, --right-side-refs put references at right, not counted in -w\n\
|
||
-S, --sentence-regexp=REGEXP for end of lines or end of sentences\n\
|
||
-T, --format=tex generate output as TeX directives\n\
|
||
-W, --word-regexp=REGEXP use REGEXP to match each keyword\n\
|
||
-b, --break-file=FILE word break characters in this FILE\n\
|
||
-f, --ignore-case fold lower case to upper case for sorting\n\
|
||
-g, --gap-size=NUMBER gap size in columns between output fields\n\
|
||
-i, --ignore-file=FILE read ignore word list from FILE\n\
|
||
-o, --only-file=FILE read only word list from this FILE\n\
|
||
-r, --references first field of each line is a reference\n\
|
||
-t, --typeset-mode - not implemented -\n\
|
||
-w, --width=NUMBER output width in columns, reference excluded\n\
|
||
--help display this help and exit\n\
|
||
--version output version information and exit\n\
|
||
\n\
|
||
With no FILE or if FILE is -, read Standard Input. `-F /' by default.\n");
|
||
}
|
||
exit (status);
|
||
}
|
||
|
||
/*----------------------------------------------------------------------.
|
||
| Main program. Decode ARGC arguments passed through the ARGV array of |
|
||
| strings, then launch execution. |
|
||
`----------------------------------------------------------------------*/
|
||
|
||
/* Long options equivalences. */
|
||
const struct option long_options[] =
|
||
{
|
||
{"auto-reference", no_argument, NULL, 'A'},
|
||
{"break-file", required_argument, NULL, 'b'},
|
||
{"copyright", no_argument, NULL, 'C'},
|
||
{"flag-truncation", required_argument, NULL, 'F'},
|
||
{"ignore-case", no_argument, NULL, 'f'},
|
||
{"gap-size", required_argument, NULL, 'g'},
|
||
{"help", no_argument, &show_help, 1},
|
||
{"ignore-file", required_argument, NULL, 'i'},
|
||
{"macro-name", required_argument, NULL, 'M'},
|
||
{"only-file", required_argument, NULL, 'o'},
|
||
{"references", no_argument, NULL, 'r'},
|
||
{"right-side-refs", no_argument, NULL, 'R'},
|
||
{"format", required_argument, NULL, 10},
|
||
{"sentence-regexp", required_argument, NULL, 'S'},
|
||
{"traditional", no_argument, NULL, 'G'},
|
||
{"typeset-mode", no_argument, NULL, 't'},
|
||
{"version", no_argument, &show_version, 1},
|
||
{"width", required_argument, NULL, 'w'},
|
||
{"word-regexp", required_argument, NULL, 'W'},
|
||
{0, 0, 0, 0},
|
||
};
|
||
|
||
static char const* const format_args[] =
|
||
{
|
||
"roff", "tex", 0
|
||
};
|
||
|
||
int
|
||
main (int argc, char *const argv[])
|
||
{
|
||
int optchar; /* argument character */
|
||
extern int optind; /* index of argument */
|
||
extern char *optarg; /* value or argument */
|
||
int file_index; /* index in text input file arrays */
|
||
|
||
#ifdef HAVE_MCHECK
|
||
/* Use GNU malloc checking. It has proven to be useful! */
|
||
mcheck ();
|
||
#endif /* HAVE_MCHECK */
|
||
|
||
#ifdef STDC_HEADERS
|
||
#ifdef HAVE_SETCHRCLASS
|
||
setchrclass (NULL);
|
||
#endif
|
||
#endif
|
||
|
||
/* Decode program options. */
|
||
|
||
program_name = argv[0];
|
||
|
||
while ((optchar = getopt_long (argc, argv, "ACF:GM:ORS:TW:b:i:fg:o:trw:",
|
||
long_options, NULL)),
|
||
optchar != EOF)
|
||
{
|
||
switch (optchar)
|
||
{
|
||
default:
|
||
usage (1);
|
||
|
||
case 0:
|
||
break;
|
||
|
||
case 'C':
|
||
printf ("%s", copyright);
|
||
exit (0);
|
||
|
||
case 'G':
|
||
gnu_extensions = 0;
|
||
break;
|
||
|
||
case 'b':
|
||
break_file = optarg;
|
||
break;
|
||
|
||
case 'f':
|
||
ignore_case = 1;
|
||
break;
|
||
|
||
case 'g':
|
||
gap_size = atoi (optarg);
|
||
break;
|
||
|
||
case 'i':
|
||
ignore_file = optarg;
|
||
break;
|
||
|
||
case 'o':
|
||
only_file = optarg;
|
||
break;
|
||
|
||
case 'r':
|
||
input_reference = 1;
|
||
break;
|
||
|
||
case 't':
|
||
/* A decouvrir... */
|
||
break;
|
||
|
||
case 'w':
|
||
line_width = atoi (optarg);
|
||
break;
|
||
|
||
case 'A':
|
||
auto_reference = 1;
|
||
break;
|
||
|
||
case 'F':
|
||
truncation_string = copy_unescaped_string (optarg);
|
||
break;
|
||
|
||
case 'M':
|
||
macro_name = optarg;
|
||
break;
|
||
|
||
case 'O':
|
||
output_format = ROFF_FORMAT;
|
||
break;
|
||
|
||
case 'R':
|
||
right_reference = 1;
|
||
break;
|
||
|
||
case 'S':
|
||
context_regex_string = copy_unescaped_string (optarg);
|
||
break;
|
||
|
||
case 'T':
|
||
output_format = TEX_FORMAT;
|
||
break;
|
||
|
||
case 'W':
|
||
word_regex_string = copy_unescaped_string (optarg);
|
||
break;
|
||
|
||
case 10:
|
||
switch (argmatch (optarg, format_args))
|
||
{
|
||
default:
|
||
usage (1);
|
||
|
||
case 0:
|
||
output_format = ROFF_FORMAT;
|
||
break;
|
||
|
||
case 1:
|
||
output_format = TEX_FORMAT;
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
/* Process trivial options. */
|
||
|
||
if (show_help)
|
||
usage (0);
|
||
|
||
if (show_version)
|
||
{
|
||
printf ("%s\n", version_string);
|
||
exit (0);
|
||
}
|
||
|
||
/* Change the default Ignore file if one is defined. */
|
||
|
||
#ifdef DEFAULT_IGNORE_FILE
|
||
if (!ignore_file)
|
||
ignore_file = DEFAULT_IGNORE_FILE;
|
||
#endif
|
||
|
||
/* Process remaining arguments. If GNU extensions are enabled, process
|
||
all arguments as input parameters. If disabled, accept at most two
|
||
arguments, the second of which is an output parameter. */
|
||
|
||
if (optind == argc)
|
||
{
|
||
|
||
/* No more argument simply means: read standard input. */
|
||
|
||
input_file_name = (const char **) xmalloc (sizeof (const char *));
|
||
file_line_count = (int *) xmalloc (sizeof (int));
|
||
number_input_files = 1;
|
||
input_file_name[0] = NULL;
|
||
}
|
||
else if (gnu_extensions)
|
||
{
|
||
number_input_files = argc - optind;
|
||
input_file_name
|
||
= (const char **) xmalloc (number_input_files * sizeof (const char *));
|
||
file_line_count
|
||
= (int *) xmalloc (number_input_files * sizeof (int));
|
||
|
||
for (file_index = 0; file_index < number_input_files; file_index++)
|
||
{
|
||
input_file_name[file_index] = argv[optind];
|
||
if (!*argv[optind] || strcmp (argv[optind], "-") == 0)
|
||
input_file_name[0] = NULL;
|
||
else
|
||
input_file_name[0] = argv[optind];
|
||
optind++;
|
||
}
|
||
}
|
||
else
|
||
{
|
||
|
||
/* There is one necessary input file. */
|
||
|
||
number_input_files = 1;
|
||
input_file_name = (const char **) xmalloc (sizeof (const char *));
|
||
file_line_count = (int *) xmalloc (sizeof (int));
|
||
if (!*argv[optind] || strcmp (argv[optind], "-") == 0)
|
||
input_file_name[0] = NULL;
|
||
else
|
||
input_file_name[0] = argv[optind];
|
||
optind++;
|
||
|
||
/* Redirect standard output, only if requested. */
|
||
|
||
if (optind < argc)
|
||
{
|
||
fclose (stdout);
|
||
if (fopen (argv[optind], "w") == NULL)
|
||
error (1, errno, argv[optind]);
|
||
optind++;
|
||
}
|
||
|
||
/* Diagnose any other argument as an error. */
|
||
|
||
if (optind < argc)
|
||
usage (1);
|
||
}
|
||
|
||
/* If the output format has not been explicitly selected, choose dumb
|
||
terminal format if GNU extensions are enabled, else `roff' format. */
|
||
|
||
if (output_format == UNKNOWN_FORMAT)
|
||
output_format = gnu_extensions ? DUMB_FORMAT : ROFF_FORMAT;
|
||
|
||
/* Initialize the main tables. */
|
||
|
||
initialize_regex ();
|
||
|
||
/* Read `Break character' file, if any. */
|
||
|
||
if (break_file)
|
||
digest_break_file (break_file);
|
||
|
||
/* Read `Ignore words' file and `Only words' files, if any. If any of
|
||
these files is empty, reset the name of the file to NULL, to avoid
|
||
unnecessary calls to search_table. */
|
||
|
||
if (ignore_file)
|
||
{
|
||
digest_word_file (ignore_file, &ignore_table);
|
||
if (ignore_table.length == 0)
|
||
ignore_file = NULL;
|
||
}
|
||
|
||
if (only_file)
|
||
{
|
||
digest_word_file (only_file, &only_table);
|
||
if (only_table.length == 0)
|
||
only_file = NULL;
|
||
}
|
||
|
||
/* Prepare to study all the input files. */
|
||
|
||
number_of_occurs[0] = 0;
|
||
total_line_count = 0;
|
||
maximum_word_length = 0;
|
||
reference_max_width = 0;
|
||
|
||
for (file_index = 0; file_index < number_input_files; file_index++)
|
||
{
|
||
|
||
/* Read the file in core, than study it. */
|
||
|
||
swallow_file_in_memory (input_file_name[file_index], &text_buffer);
|
||
find_occurs_in_text ();
|
||
|
||
/* Maintain for each file how many lines has been read so far when its
|
||
end is reached. Incrementing the count first is a simple kludge to
|
||
handle a possible incomplete line at end of file. */
|
||
|
||
total_line_count++;
|
||
file_line_count[file_index] = total_line_count;
|
||
}
|
||
|
||
/* Do the output process phase. */
|
||
|
||
sort_found_occurs ();
|
||
fix_output_parameters ();
|
||
generate_all_output ();
|
||
|
||
/* All done. */
|
||
|
||
exit (0);
|
||
}
|