504 lines
11 KiB
C++
504 lines
11 KiB
C++
// -*- C++ -*-
|
|
/* Copyright (C) 1989, 1990, 1991, 1992 Free Software Foundation, Inc.
|
|
Written by James Clark (jjc@jclark.com)
|
|
|
|
This file is part of groff.
|
|
|
|
groff is free software; you can redistribute it and/or modify it under
|
|
the terms of the GNU General Public License as published by the Free
|
|
Software Foundation; either version 2, or (at your option) any later
|
|
version.
|
|
|
|
groff is distributed in the hope that it will be useful, but WITHOUT ANY
|
|
WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
|
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
|
for more details.
|
|
|
|
You should have received a copy of the GNU General Public License along
|
|
with groff; see the file COPYING. If not, write to the Free Software
|
|
Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */
|
|
|
|
|
|
#include <string.h>
|
|
#include <stdlib.h>
|
|
#include <assert.h>
|
|
#include <errno.h>
|
|
|
|
#include "posix.h"
|
|
#include "lib.h"
|
|
#include "errarg.h"
|
|
#include "error.h"
|
|
#include "cset.h"
|
|
#include "cmap.h"
|
|
#include "nonposix.h"
|
|
|
|
#include "refid.h"
|
|
#include "search.h"
|
|
|
|
class file_buffer {
|
|
char *buffer;
|
|
char *bufend;
|
|
public:
|
|
file_buffer();
|
|
~file_buffer();
|
|
int load(int fd, const char *filename);
|
|
const char *get_start() const;
|
|
const char *get_end() const;
|
|
};
|
|
|
|
typedef unsigned char uchar;
|
|
|
|
static uchar map[256];
|
|
static uchar inv_map[256][3];
|
|
|
|
struct map_init {
|
|
map_init();
|
|
};
|
|
|
|
static map_init the_map_init;
|
|
|
|
map_init::map_init()
|
|
{
|
|
int i;
|
|
for (i = 0; i < 256; i++)
|
|
map[i] = csalnum(i) ? cmlower(i) : '\0';
|
|
for (i = 0; i < 256; i++) {
|
|
if (cslower(i)) {
|
|
inv_map[i][0] = i;
|
|
inv_map[i][1] = cmupper(i);
|
|
inv_map[i][2] = '\0';
|
|
}
|
|
else if (csdigit(i)) {
|
|
inv_map[i][0] = i;
|
|
inv_map[i][1] = 0;
|
|
}
|
|
else
|
|
inv_map[i][0] = '\0';
|
|
}
|
|
}
|
|
|
|
|
|
class bmpattern {
|
|
char *pat;
|
|
int len;
|
|
int delta[256];
|
|
public:
|
|
bmpattern(const char *pattern, int pattern_length);
|
|
~bmpattern();
|
|
const char *search(const char *p, const char *end) const;
|
|
int length() const;
|
|
};
|
|
|
|
bmpattern::bmpattern(const char *pattern, int pattern_length)
|
|
: len(pattern_length)
|
|
{
|
|
pat = new char[len];
|
|
int i;
|
|
for (i = 0; i < len; i++)
|
|
pat[i] = map[uchar(pattern[i])];
|
|
for (i = 0; i < 256; i++)
|
|
delta[i] = len;
|
|
for (i = 0; i < len; i++)
|
|
for (const unsigned char *inv = inv_map[uchar(pat[i])]; *inv; inv++)
|
|
delta[*inv] = len - i - 1;
|
|
}
|
|
|
|
const char *bmpattern::search(const char *buf, const char *end) const
|
|
{
|
|
int buflen = end - buf;
|
|
if (len > buflen)
|
|
return 0;
|
|
const char *strend;
|
|
if (buflen > len*4)
|
|
strend = end - len*4;
|
|
else
|
|
strend = buf;
|
|
const char *k = buf + len - 1;
|
|
const int *del = delta;
|
|
const char *pattern = pat;
|
|
for (;;) {
|
|
while (k < strend) {
|
|
int t = del[uchar(*k)];
|
|
if (!t)
|
|
break;
|
|
k += t;
|
|
k += del[uchar(*k)];
|
|
k += del[uchar(*k)];
|
|
}
|
|
while (k < end && del[uchar(*k)] != 0)
|
|
k++;
|
|
if (k == end)
|
|
break;
|
|
int j = len - 1;
|
|
const char *s = k;
|
|
for (;;) {
|
|
if (j == 0)
|
|
return s;
|
|
if (map[uchar(*--s)] != uchar(pattern[--j]))
|
|
break;
|
|
}
|
|
k++;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
bmpattern::~bmpattern()
|
|
{
|
|
a_delete pat;
|
|
}
|
|
|
|
inline int bmpattern::length() const
|
|
{
|
|
return len;
|
|
}
|
|
|
|
|
|
static const char *find_end(const char *bufend, const char *p);
|
|
|
|
const char *linear_searcher::search_and_check(const bmpattern *key,
|
|
const char *buf, const char *bufend, const char **start) const
|
|
{
|
|
assert(buf[-1] == '\n');
|
|
assert(bufend[-1] == '\n');
|
|
const char *ptr = buf;
|
|
for (;;) {
|
|
const char *found = key->search(ptr, bufend);
|
|
if (!found)
|
|
break;
|
|
if (check_match(buf, bufend, found, key->length(), &ptr, start))
|
|
return found;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static const char *skip_field(const char *end, const char *p)
|
|
{
|
|
for (;;)
|
|
if (*p++ == '\n') {
|
|
if (p == end || *p == '%')
|
|
break;
|
|
const char *q;
|
|
for (q = p; *q == ' ' || *q == '\t'; q++)
|
|
;
|
|
if (*q == '\n')
|
|
break;
|
|
p = q + 1;
|
|
}
|
|
return p;
|
|
}
|
|
|
|
static const char *find_end(const char *bufend, const char *p)
|
|
{
|
|
for (;;)
|
|
if (*p++ == '\n') {
|
|
if (p == bufend)
|
|
break;
|
|
const char *q;
|
|
for (q = p; *q == ' ' || *q == '\t'; q++)
|
|
;
|
|
if (*q == '\n')
|
|
break;
|
|
p = q + 1;
|
|
}
|
|
return p;
|
|
}
|
|
|
|
|
|
int linear_searcher::check_match(const char *buf, const char *bufend,
|
|
const char *match, int matchlen,
|
|
const char **cont, const char **start) const
|
|
{
|
|
*cont = match + 1;
|
|
// The user is required to supply only the first truncate_len characters
|
|
// of the key. If truncate_len <= 0, he must supply all the key.
|
|
if ((truncate_len <= 0 || matchlen < truncate_len)
|
|
&& map[uchar(match[matchlen])] != '\0')
|
|
return 0;
|
|
|
|
// The character before the match must not be an alphanumeric
|
|
// character (unless the alphanumeric character follows one or two
|
|
// percent characters at the beginning of the line), nor must it be
|
|
// a percent character at the beginning of a line, nor a percent
|
|
// character following a percent character at the beginning of a
|
|
// line.
|
|
|
|
switch (match - buf) {
|
|
case 0:
|
|
break;
|
|
case 1:
|
|
if (match[-1] == '%' || map[uchar(match[-1])] != '\0')
|
|
return 0;
|
|
break;
|
|
case 2:
|
|
if (map[uchar(match[-1])] != '\0' && match[-2] != '%')
|
|
return 0;
|
|
if (match[-1] == '%'
|
|
&& (match[-2] == '\n' || match[-2] == '%'))
|
|
return 0;
|
|
break;
|
|
default:
|
|
if (map[uchar(match[-1])] != '\0'
|
|
&& !(match[-2] == '%'
|
|
&& (match[-3] == '\n'
|
|
|| (match[-3] == '%' && match[-4] == '\n'))))
|
|
return 0;
|
|
if (match[-1] == '%'
|
|
&& (match[-2] == '\n'
|
|
|| (match[-2] == '%' && match[-3] == '\n')))
|
|
return 0;
|
|
}
|
|
|
|
const char *p = match;
|
|
int had_percent = 0;
|
|
for (;;) {
|
|
if (*p == '\n') {
|
|
if (!had_percent && p[1] == '%') {
|
|
if (p[2] != '\0' && strchr(ignore_fields, p[2]) != 0) {
|
|
*cont = skip_field(bufend, match + matchlen);
|
|
return 0;
|
|
}
|
|
if (!start)
|
|
break;
|
|
had_percent = 1;
|
|
}
|
|
if (p <= buf) {
|
|
if (start)
|
|
*start = p + 1;
|
|
return 1;
|
|
}
|
|
const char *q;
|
|
for (q = p - 1; *q == ' ' || *q == '\t'; q--)
|
|
;
|
|
if (*q == '\n') {
|
|
if (start)
|
|
*start = p + 1;
|
|
break;
|
|
}
|
|
p = q;
|
|
}
|
|
p--;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
file_buffer::file_buffer()
|
|
: buffer(0), bufend(0)
|
|
{
|
|
}
|
|
|
|
file_buffer::~file_buffer()
|
|
{
|
|
a_delete buffer;
|
|
}
|
|
|
|
const char *file_buffer::get_start() const
|
|
{
|
|
return buffer ? buffer + 4 : 0;
|
|
}
|
|
|
|
const char *file_buffer::get_end() const
|
|
{
|
|
return bufend;
|
|
}
|
|
|
|
int file_buffer::load(int fd, const char *filename)
|
|
{
|
|
struct stat sb;
|
|
if (fstat(fd, &sb) < 0)
|
|
error("can't fstat `%1': %2", filename, strerror(errno));
|
|
else if (!S_ISREG(sb.st_mode))
|
|
error("`%1' is not a regular file", filename);
|
|
else {
|
|
// We need one character extra at the beginning for an additional newline
|
|
// used as a sentinel. We get 4 instead so that the read buffer will be
|
|
// word-aligned. This seems to make the read slightly faster. We also
|
|
// need one character at the end also for an additional newline used as a
|
|
// sentinel.
|
|
int size = int(sb.st_size);
|
|
buffer = new char[size + 4 + 1];
|
|
int nread = read(fd, buffer + 4, size);
|
|
if (nread < 0)
|
|
error("error reading `%1': %2", filename, strerror(errno));
|
|
else if (nread != size)
|
|
error("size of `%1' decreased", filename);
|
|
else {
|
|
char c;
|
|
nread = read(fd, &c, 1);
|
|
if (nread != 0)
|
|
error("size of `%1' increased", filename);
|
|
else if (memchr(buffer + 4, '\0', size < 1024 ? size : 1024) != 0)
|
|
error("database `%1' is a binary file", filename);
|
|
else {
|
|
close(fd);
|
|
buffer[3] = '\n';
|
|
int sidx = 4, didx = 4;
|
|
for ( ; sidx < size + 4; sidx++, didx++)
|
|
{
|
|
if (buffer[sidx] == '\r')
|
|
{
|
|
if (buffer[++sidx] != '\n')
|
|
buffer[didx++] = '\r';
|
|
else
|
|
size--;
|
|
}
|
|
if (sidx != didx)
|
|
buffer[didx] = buffer[sidx];
|
|
}
|
|
bufend = buffer + 4 + size;
|
|
if (bufend[-1] != '\n')
|
|
*bufend++ = '\n';
|
|
return 1;
|
|
}
|
|
}
|
|
a_delete buffer;
|
|
buffer = 0;
|
|
}
|
|
close(fd);
|
|
return 0;
|
|
}
|
|
|
|
linear_searcher::linear_searcher(const char *query, int query_len,
|
|
const char *ign, int trunc)
|
|
: ignore_fields(ign), truncate_len(trunc), keys(0), nkeys(0)
|
|
{
|
|
const char *query_end = query + query_len;
|
|
int nk = 0;
|
|
const char *p;
|
|
for (p = query; p < query_end; p++)
|
|
if (map[uchar(*p)] != '\0'
|
|
&& (p[1] == '\0' || map[uchar(p[1])] == '\0'))
|
|
nk++;
|
|
if (nk == 0)
|
|
return;
|
|
keys = new bmpattern*[nk];
|
|
p = query;
|
|
for (;;) {
|
|
while (p < query_end && map[uchar(*p)] == '\0')
|
|
p++;
|
|
if (p == query_end)
|
|
break;
|
|
const char *start = p;
|
|
while (p < query_end && map[uchar(*p)] != '\0')
|
|
p++;
|
|
keys[nkeys++] = new bmpattern(start, p - start);
|
|
}
|
|
assert(nkeys <= nk);
|
|
if (nkeys == 0) {
|
|
a_delete keys;
|
|
keys = 0;
|
|
}
|
|
}
|
|
|
|
linear_searcher::~linear_searcher()
|
|
{
|
|
for (int i = 0; i < nkeys; i++)
|
|
delete keys[i];
|
|
a_delete keys;
|
|
}
|
|
|
|
int linear_searcher::search(const char *buffer, const char *bufend,
|
|
const char **startp, int *lengthp) const
|
|
{
|
|
assert(bufend - buffer > 0);
|
|
assert(buffer[-1] == '\n');
|
|
assert(bufend[-1] == '\n');
|
|
if (nkeys == 0)
|
|
return 0;
|
|
for (;;) {
|
|
const char *refstart;
|
|
const char *found = search_and_check(keys[0], buffer, bufend, &refstart);
|
|
if (!found)
|
|
break;
|
|
const char *refend = find_end(bufend, found + keys[0]->length());
|
|
int i;
|
|
for (i = 1; i < nkeys; i++)
|
|
if (!search_and_check(keys[i], refstart, refend))
|
|
break;
|
|
if (i >= nkeys) {
|
|
*startp = refstart;
|
|
*lengthp = refend - refstart;
|
|
return 1;
|
|
}
|
|
buffer = refend;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
class linear_search_item : public search_item {
|
|
file_buffer fbuf;
|
|
public:
|
|
linear_search_item(const char *filename, int fid);
|
|
~linear_search_item();
|
|
int load(int fd);
|
|
search_item_iterator *make_search_item_iterator(const char *);
|
|
friend class linear_search_item_iterator;
|
|
};
|
|
|
|
class linear_search_item_iterator : public search_item_iterator {
|
|
linear_search_item *lsi;
|
|
int pos;
|
|
public:
|
|
linear_search_item_iterator(linear_search_item *, const char *query);
|
|
~linear_search_item_iterator();
|
|
int next(const linear_searcher &, const char **ptr, int *lenp,
|
|
reference_id *ridp);
|
|
};
|
|
|
|
search_item *make_linear_search_item(int fd, const char *filename, int fid)
|
|
{
|
|
linear_search_item *item = new linear_search_item(filename, fid);
|
|
if (!item->load(fd)) {
|
|
delete item;
|
|
return 0;
|
|
}
|
|
else
|
|
return item;
|
|
}
|
|
|
|
linear_search_item::linear_search_item(const char *filename, int fid)
|
|
: search_item(filename, fid)
|
|
{
|
|
}
|
|
|
|
linear_search_item::~linear_search_item()
|
|
{
|
|
}
|
|
|
|
int linear_search_item::load(int fd)
|
|
{
|
|
return fbuf.load(fd, name);
|
|
}
|
|
|
|
search_item_iterator *linear_search_item::make_search_item_iterator(
|
|
const char *query)
|
|
{
|
|
return new linear_search_item_iterator(this, query);
|
|
}
|
|
|
|
linear_search_item_iterator::linear_search_item_iterator(
|
|
linear_search_item *p, const char *)
|
|
: lsi(p), pos(0)
|
|
{
|
|
}
|
|
|
|
linear_search_item_iterator::~linear_search_item_iterator()
|
|
{
|
|
}
|
|
|
|
int linear_search_item_iterator::next(const linear_searcher &searcher,
|
|
const char **startp, int *lengthp,
|
|
reference_id *ridp)
|
|
{
|
|
const char *bufstart = lsi->fbuf.get_start();
|
|
const char *bufend = lsi->fbuf.get_end();
|
|
const char *ptr = bufstart + pos;
|
|
if (ptr < bufend && searcher.search(ptr, bufend, startp, lengthp)) {
|
|
pos = *startp + *lengthp - bufstart;
|
|
if (ridp)
|
|
*ridp = reference_id(lsi->filename_id, *startp - bufstart);
|
|
return 1;
|
|
}
|
|
else
|
|
return 0;
|
|
}
|