freebsd-dev/usr.bin/sgmls/libsgmls/sgmls.c
John Fieber 61614fe0ad The sgmls SGML parser. Support for our hypertext documents.
Reviewed by:	Paul Richards, Garrett Wollman
1995-04-27 16:03:47 +00:00

1037 lines
21 KiB
C

/* sgmls.c:
Library for reading output of sgmls.
Written by James Clark (jjc@jclark.com). */
#include "config.h"
#include "std.h"
#include "sgmls.h"
#include "lineout.h"
/* NO_RETURN is placed on the declaration and definition of error().
   Under GCC it expands to `volatile'; elsewhere it expands to nothing.
   NOTE(review): presumably this relies on the old GCC convention that a
   `volatile' function never returns -- confirm for the compilers in use. */
#ifdef __GNUC__
#define NO_RETURN volatile
#else
#define NO_RETURN /* as nothing */
#endif
/* P() wraps the parameter list of a forward declaration: the list is kept
   when USE_PROTOTYPES is defined and replaced by `()' otherwise. */
#ifdef USE_PROTOTYPES
#define P(parms) parms
#else
#define P(parms) ()
#endif
/* When __STDC__ is not defined, `const' is defined away. */
#ifndef __STDC__
#define const /* as nothing */
#endif
/* Short aliases for the struct types used throughout this file.
   NOTE(review): the structs themselves are not defined here; presumably
   they come from sgmls.h -- confirm. */
typedef struct sgmls_data data_s;
typedef struct sgmls_notation notation_s;
typedef struct sgmls_internal_entity internal_entity_s;
typedef struct sgmls_external_entity external_entity_s;
typedef struct sgmls_entity entity_s;
typedef struct sgmls_attribute attribute_s;
typedef struct sgmls_event event_s;
/* lists are sorted in reverse order of level */
/* Generic view of a list node.  list_find() and list_finish_level() take
   `struct list' pointers, and their callers cast entity_list and
   notation_list pointers to this type.
   NOTE(review): this relies on the third member of those structs starting
   with a `char *name' -- confirm against the struct definitions. */
struct list {
int subdoc_level; /* -1 if associated with finished subdoc */
struct list *next;
char *name;
};
/* Node in the list of defined entities (see struct sgmls.entities). */
struct entity_list {
int subdoc_level;
struct entity_list *next;
entity_s entity;
};
/* Node in the list of defined notations (see struct sgmls.notations). */
struct notation_list {
int subdoc_level;
struct notation_list *next;
notation_s notation;
};
/* State of one parser created by sgmls_create(). */
struct sgmls {
FILE *fp; /* stream that read_line() reads from */
char *buf; /* current input line; grown by read_line() */
unsigned buf_size; /* number of bytes allocated for buf */
struct entity_list *entities; /* defined entities; new ones are pushed on the front */
struct notation_list *notations; /* defined notations; new ones are pushed on the front */
attribute_s *attributes; /* attributes collected for the next start event */
unsigned long lineno; /* line number set by parse_location(), advanced by parse_data() */
char *filename; /* file name set by parse_location() */
unsigned filename_size; /* number of bytes allocated for filename */
unsigned long input_lineno; /* number of lines read from fp so far */
int subdoc_level; /* raised on subdoc start, lowered on subdoc end */
char **files; /* from `f' commands */
int nfiles; /* number of entries in files */
char *sysid; /* from `s' command */
char *pubid; /* from `p' command */
};
/* Error codes passed to the error handler.  Each code is also the index of
   its message in errlist[], so the two must be kept in the same order. */
enum error_code {
  E_ZERO,			/* Not an error */
  E_NOMEM,			/* Out of memory */
  E_BADESCAPE,			/* Bad escape */
  E_NULESCAPE,			/* \000 other than in data */
  E_NUL,			/* A null input character */
  E_BADENTITY,			/* Reference to undefined entity */
  E_INTERNALENTITY,		/* Internal entity when external was needed */
  E_SYSTEM,			/* System input error */
  E_COMMAND,			/* Bad command letter */
  E_MISSING,			/* Missing arguments */
  E_NUMBER,			/* Not a number */
  E_ATTR,			/* Bad attribute type */
  E_BADNOTATION,		/* Reference to undefined notation */
  E_BADINTERNAL,		/* Bad internal entity type */
  E_BADEXTERNAL,		/* Bad external entity type */
  E_EOF,			/* EOF in middle of line */
  E_SDATA,			/* \| other than in data */
  E_LINELENGTH			/* line longer than UNSIGNED_MAX */
};
/* Messages indexed by enum error_code.  There is exactly one entry per
   code; a stale "Too many V commands" entry with no matching code used to
   sit before the last one, which made E_LINELENGTH report the wrong text. */
static char *errlist[] = {
  0,						/* E_ZERO */
  "Out of memory",				/* E_NOMEM */
  "Bad escape",					/* E_BADESCAPE */
  "\\0 escape not in data",			/* E_NULESCAPE */
  "Nul character in input",			/* E_NUL */
  "Reference to undefined entity",		/* E_BADENTITY */
  "Internal entity when external was needed",	/* E_INTERNALENTITY */
  "System input error",				/* E_SYSTEM */
  "Bad command letter",				/* E_COMMAND */
  "Missing arguments",				/* E_MISSING */
  "Not a number",				/* E_NUMBER */
  "Bad attribute type",				/* E_ATTR */
  "Reference to undefined notation",		/* E_BADNOTATION */
  "Bad internal entity type",			/* E_BADINTERNAL */
  "Bad external entity type",			/* E_BADEXTERNAL */
  "EOF in middle of line",			/* E_EOF */
  "\\| other than in data",			/* E_SDATA */
  "Input line too long"				/* E_LINELENGTH */
};
/* Forward declarations of the file-local helpers.  The parameter lists are
   wrapped in P() so that they disappear when USE_PROTOTYPES is not
   defined. */
static void NO_RETURN error P((enum error_code));
static int parse_data P((char *, unsigned long *));
static void parse_location P((char *, struct sgmls *));
static void parse_notation P((char *, notation_s *));
static void parse_internal_entity P((char *, internal_entity_s *));
static void parse_external_entity
P((char *, struct sgmls *, external_entity_s *));
static void parse_subdoc_entity P((char *, external_entity_s *));
static attribute_s *parse_attribute P((struct sgmls *, char *));
static void grow_datav P((void));
static char *unescape P((char *));
static char *unescape_file P((char *));
static int unescape1 P((char *));
static char *scan_token P((char **));
static int count_args P((char *));
static struct list *list_find P((struct list *, char *, int));
static UNIV xmalloc P((unsigned));
static UNIV xrealloc P((UNIV , unsigned));
static char *strsave P((char *));
static int read_line P((struct sgmls *));
static notation_s *lookup_notation P((struct sgmls *, char *));
static entity_s *lookup_entity P((struct sgmls *, char *));
static external_entity_s *lookup_external_entity P((struct sgmls *, char *));
static void define_external_entity P((struct sgmls *, external_entity_s *));
static void define_internal_entity P((struct sgmls *, internal_entity_s *));
static void define_notation P((struct sgmls *, notation_s *));
static data_s *copy_data P((data_s *, int));
static void list_finish_level P((struct list **, int));
static void add_attribute P((attribute_s **, attribute_s *));
static void default_errhandler P((int, char *, unsigned long));
/* Free s, but only when it is non-null. */
#define xfree(s) do { if (s) free(s); } while (0)
/* Handler called by error(); replaced through sgmls_set_errhandler(). */
static sgmls_errhandler *errhandler = default_errhandler;
/* Input line number handed to the error handler; kept up to date by
   read_line(). */
static unsigned long input_lineno = 0;
/* Array of data segments filled in by parse_data().  It is file-wide, not
   per parser; grow_datav() enlarges it and sgmls_free() releases it. */
static data_s *datav = 0;
/* Number of elements allocated in datav. */
static int datav_size = 0;
/* Allocate a parser that reads from fp.  Apart from the stream, every
   field starts out as zero/null.  Returns 0 when the allocation fails. */
struct sgmls *sgmls_create(fp)
     FILE *fp;
{
  struct sgmls *parser = (struct sgmls *)malloc(sizeof(*parser));

  if (parser == 0)
    return 0;

  /* Input stream and line buffer. */
  parser->fp = fp;
  parser->buf = 0;
  parser->buf_size = 0;
  parser->input_lineno = 0;

  /* Location reported with events. */
  parser->filename = 0;
  parser->filename_size = 0;
  parser->lineno = 0;

  /* Definitions and pending attributes. */
  parser->entities = 0;
  parser->notations = 0;
  parser->attributes = 0;
  parser->subdoc_level = 0;

  /* Identifiers waiting for the next definition. */
  parser->files = 0;
  parser->nfiles = 0;
  parser->sysid = 0;
  parser->pubid = 0;

  return parser;
}
/* Release the parser sp together with everything it owns: the file name,
   pending attributes, every entity and notation definition, the line
   buffer and any pending public/system identifiers and file names.  The
   file-wide datav array is released and reset as well.  A null sp is
   ignored. */
void sgmls_free(sp)
     struct sgmls *sp;
{
  struct entity_list *ep;
  struct notation_list *np;
  if (!sp)
    return;
  xfree(sp->filename);
  sgmls_free_attributes(sp->attributes);
  for (ep = sp->entities; ep;) {
    struct entity_list *tem = ep->next;
    if (ep->entity.is_internal) {
      xfree(ep->entity.u.internal.data.s);
      free(ep->entity.u.internal.name);
    }
    else {
      int i;
      for (i = 0; i < ep->entity.u.external.nfilenames; i++)
	xfree(ep->entity.u.external.filenames[i]);
      xfree(ep->entity.u.external.filenames);
      xfree(ep->entity.u.external.sysid);
      xfree(ep->entity.u.external.pubid);
      sgmls_free_attributes(ep->entity.u.external.attributes);
      /* Free the name through the union member that is actually in use
	 for this entity, rather than through u.internal. */
      free(ep->entity.u.external.name);
    }
    free(ep);
    ep = tem;
  }
  for (np = sp->notations; np;) {
    struct notation_list *tem = np->next;
    xfree(np->notation.sysid);
    xfree(np->notation.pubid);
    free(np->notation.name);
    free(np);
    np = tem;
  }
  xfree(sp->buf);
  xfree(sp->pubid);
  xfree(sp->sysid);
  if (sp->files) {
    int i;
    for (i = 0; i < sp->nfiles; i++)
      free(sp->files[i]);
    free(sp->files);
  }
  free(sp);
  /* datav is shared file-wide, so clear it to avoid a dangling pointer. */
  xfree(datav);
  datav = 0;
  datav_size = 0;
}
/* Install handler as the error handler and return the handler that was in
   effect before the call.  A null handler leaves the current one in
   place, so the call then only queries it. */
sgmls_errhandler *sgmls_set_errhandler(handler)
     sgmls_errhandler *handler;
{
  sgmls_errhandler *previous;

  previous = errhandler;
  if (handler != 0)
    errhandler = handler;
  return previous;
}
/* Read input lines until one produces an event, store the event in *e and
   return 1.  Return 0 once read_line() reports end of input.

   Lines that only update parser state (attributes, definitions, location,
   pending identifiers and file names) are consumed without returning.
   Strings stored in the event point into the parser's line buffer or into
   the shared datav array, both of which are reused by later calls.  The
   attribute list of a start event is handed over to the event and removed
   from the parser.  An unknown command letter is reported as E_COMMAND. */
int sgmls_next(sp, e)
     struct sgmls *sp;
     event_s *e;
{
  while (read_line(sp)) {
    char *buf = sp->buf;
    e->filename = sp->filename;
    e->lineno = sp->lineno;
    switch (buf[0]) {
    case DATA_CODE:
      e->u.data.n = parse_data(buf + 1, &sp->lineno);
      e->u.data.v = datav;
      e->type = SGMLS_EVENT_DATA;
      return 1;
    case START_CODE:
      {
	char *p;
	/* Hand the collected attributes over to the event. */
	e->u.start.attributes = sp->attributes;
	sp->attributes = 0;
	e->type = SGMLS_EVENT_START;
	p = buf + 1;
	e->u.start.gi = scan_token(&p);
	return 1;
      }
    case END_CODE:
      {
	char *p = buf + 1;
	e->type = SGMLS_EVENT_END;
	e->u.end.gi = scan_token(&p);
	return 1;
      }
    case START_SUBDOC_CODE:
    case END_SUBDOC_CODE:
      {
	char *p = buf + 1;
	char *name = scan_token(&p);
	if (buf[0] == START_SUBDOC_CODE) {
	  /* Look the entity up before entering the new level. */
	  e->u.entity = lookup_external_entity(sp, name);
	  sp->subdoc_level++;
	  e->type = SGMLS_EVENT_SUBSTART;
	}
	else {
	  e->type = SGMLS_EVENT_SUBEND;
	  list_finish_level((struct list **)&sp->entities, sp->subdoc_level);
	  list_finish_level((struct list **)&sp->notations, sp->subdoc_level);
	  /* Look the entity up after returning to the enclosing level. */
	  sp->subdoc_level--;
	  e->u.entity = lookup_external_entity(sp, name);
	}
	return 1;
      }
    case ATTRIBUTE_CODE:
      add_attribute(&sp->attributes, parse_attribute(sp, buf + 1));
      break;
    case DATA_ATTRIBUTE_CODE:
      {
	char *p = buf + 1;
	char *name;
	attribute_s *a;
	external_entity_s *ext;
	name = scan_token(&p);
	a = parse_attribute(sp, p);
	ext = lookup_external_entity(sp, name);
	add_attribute(&ext->attributes, a);
      }
      break;
    case REFERENCE_ENTITY_CODE:
      {
	char *p = buf + 1;
	char *name;
	name = scan_token(&p);
	e->u.entity = lookup_external_entity(sp, name);
	e->type = SGMLS_EVENT_ENTITY;
	return 1;
      }
    case DEFINE_NOTATION_CODE:
      {
	notation_s notation;
	parse_notation(buf + 1, &notation);
	define_notation(sp, &notation);
      }
      break;
    case DEFINE_EXTERNAL_ENTITY_CODE:
      {
	external_entity_s external;
	parse_external_entity(buf + 1, sp, &external);
	define_external_entity(sp, &external);
      }
      break;
    case DEFINE_SUBDOC_ENTITY_CODE:
      {
	external_entity_s external;
	parse_subdoc_entity(buf + 1, &external);
	define_external_entity(sp, &external);
      }
      break;
    case DEFINE_INTERNAL_ENTITY_CODE:
      {
	internal_entity_s internal;
	parse_internal_entity(buf + 1, &internal);
	define_internal_entity(sp, &internal);
      }
      break;
    case PI_CODE:
      e->u.pi.len = unescape1(buf + 1);
      e->u.pi.s = buf + 1;
      e->type = SGMLS_EVENT_PI;
      return 1;
    case LOCATION_CODE:
      parse_location(buf + 1, sp);
      break;
    case APPINFO_CODE:
      e->u.appinfo = unescape(buf + 1);
      e->type = SGMLS_EVENT_APPINFO;
      return 1;
    case SYSID_CODE:
      {
	/* Build the new value first, then drop any identifier that was
	   never claimed by a definition, so that it is not leaked. */
	char *sysid = strsave(unescape(buf + 1));
	xfree(sp->sysid);
	sp->sysid = sysid;
      }
      break;
    case PUBID_CODE:
      {
	/* Same as for the system identifier above. */
	char *pubid = strsave(unescape(buf + 1));
	xfree(sp->pubid);
	sp->pubid = pubid;
      }
      break;
    case FILE_CODE:
      sp->files = xrealloc(sp->files, (sp->nfiles + 1)*sizeof(char *));
      sp->files[sp->nfiles] = strsave(unescape_file(buf + 1));
      sp->nfiles += 1;
      break;
    case CONFORMING_CODE:
      e->type = SGMLS_EVENT_CONFORMING;
      return 1;
    default:
      error(E_COMMAND);
    }
  }
  return 0;
}
/* Decode the escaped data string p in place and split it into segments,
   which are stored in the file-wide datav array (grown on demand).
   Returns the number of segments stored.

   Escapes handled: `\\' (a backslash), `\n' (RECHAR, which also adds one
   to *linenop), `\' followed by one to three octal digits (that character
   value), and `\|' (ends the current segment and toggles whether the
   following one is SDATA).  Any other escape is reported as E_BADESCAPE.
   Each segment's `s' points into the buffer p, so the segments are only
   valid while that buffer is. */
static
int parse_data(p, linenop)
char *p;
unsigned long *linenop;
{
int n = 0;
char *start = p;
char *q;
int is_sdata = 0;
/* No need to copy before first escape. */
for (; *p != '\\' && *p != '\0'; p++)
;
/* p reads the escaped text, q writes the decoded text; q never gets
   ahead of p. */
q = p;
while (*p) {
if (*p == '\\') {
switch (*++p) {
case '\\':
*q++ = *p++;
break;
case 'n':
*q++ = RECHAR;
*linenop += 1;
p++;
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
{
/* Up to three octal digits. */
int val = *p++ - '0';
if (*p >= '0' && *p <= '7') {
val = val*8 + (*p++ - '0');
if (*p >= '0' && *p <= '7')
val = val*8 + (*p++ - '0');
}
*q++ = (char)val;
}
break;
case '|':
/* Close the current segment; an empty segment is kept only if it
   is SDATA. */
if (q > start || is_sdata) {
if (n >= datav_size)
grow_datav();
datav[n].s = start;
datav[n].len = q - start;
datav[n].is_sdata = is_sdata;
n++;
}
is_sdata = !is_sdata;
start = q;
p++;
break;
default:
error(E_BADESCAPE);
}
}
else
*q++ = *p++;
}
/* Store the final segment under the same rule as above. */
if (q > start || is_sdata) {
if (n >= datav_size)
grow_datav();
datav[n].s = start;
datav[n].len = q - start;
datav[n].is_sdata = is_sdata;
n++;
}
return n;
}
/* Enlarge the shared datav array: start with two elements, then double
   the element count on each further call. */
static
void grow_datav()
{
  unsigned new_size;

  if (datav_size)
    new_size = 2*datav_size;
  else
    new_size = 2;
  datav = (data_s *)xrealloc((UNIV)datav, new_size*sizeof(data_s));
  datav_size = new_size;
}
/* Parse the argument of a location command: a decimal line number that
   may be followed by a space and an escaped file name.  The line number
   is stored in sp->lineno; when a file name is present it replaces
   sp->filename, whose storage is enlarged if it is too small.  A missing
   or unreadable number is reported as E_NUMBER. */
static
void parse_location(s, sp)
     char *s;
     struct sgmls *sp;
{
  if (!(*s >= '0' && *s <= '9') || sscanf(s, "%lu", &sp->lineno) != 1)
    error(E_NUMBER);
  /* Step over the digits that were just converted. */
  for (s++; *s >= '0' && *s <= '9'; s++)
    ;
  if (*s == ' ') {
    char *name = unescape_file(s + 1);
    unsigned needed = strlen(name) + 1;

    if (needed > sp->filename_size) {
      sp->filename = xrealloc(sp->filename, needed);
      sp->filename_size = needed;
    }
    strcpy(sp->filename, name);
  }
}
/* Parse a notation definition: take the first token of s and store a
   freshly allocated copy of it as the notation name.  No other field of
   *n is written here. */
static
void parse_notation(s, n)
     char *s;
     notation_s *n;
{
  char *name;

  name = scan_token(&s);
  n->name = strsave(name);
}
/* Parse an internal entity definition of the form `name type text'.
   The name is copied; type must be CDATA or SDATA (anything else is
   reported as E_BADINTERNAL); the remaining text is unescaped and copied
   into newly allocated storage, or recorded as a null pointer when it is
   empty. */
static
void parse_internal_entity(s, e)
     char *s;
     internal_entity_s *e;
{
  char *kind;

  e->name = strsave(scan_token(&s));
  kind = scan_token(&s);
  if (strcmp(kind, "SDATA") == 0)
    e->data.is_sdata = 1;
  else if (strcmp(kind, "CDATA") == 0)
    e->data.is_sdata = 0;
  else
    error(E_BADINTERNAL);
  e->data.len = unescape1(s);
  if (e->data.len != 0) {
    e->data.s = xmalloc(e->data.len);
    memcpy(e->data.s, s, e->data.len);
  }
  else
    e->data.s = 0;
}
/* Parse an external entity definition of the form `name type notation'.
   The name is copied; type must be CDATA, SDATA or NDATA (anything else
   is reported as E_BADEXTERNAL); the notation must already be defined at
   the current subdocument level.  Only name, type and notation of *e are
   written here. */
static
void parse_external_entity(s, sp, e)
     char *s;
     struct sgmls *sp;
     external_entity_s *e;
{
  char *kind;

  e->name = strsave(scan_token(&s));
  kind = scan_token(&s);
  if (strcmp(kind, "NDATA") == 0)
    e->type = SGMLS_ENTITY_NDATA;
  else if (strcmp(kind, "SDATA") == 0)
    e->type = SGMLS_ENTITY_SDATA;
  else if (strcmp(kind, "CDATA") == 0)
    e->type = SGMLS_ENTITY_CDATA;
  else
    error(E_BADEXTERNAL);
  e->notation = lookup_notation(sp, scan_token(&s));
}
static
void parse_subdoc_entity(s, e)
char *s;
external_entity_s *e;
{
e->name = strsave(scan_token(&s));
e->type = SGMLS_ENTITY_SUBDOC;
}
/* Parse an attribute specification of the form `name type value...' and
   return a newly allocated attribute.  The `next' link is not set here.

   Recognised types:
     IMPLIED   no value;
     CDATA     the rest of the line is decoded by parse_data() and copied;
     NOTATION  one token naming a defined notation;
     ENTITY    one or more tokens naming defined entities;
     TOKEN     one or more tokens, each copied.
   An empty ENTITY or TOKEN list is reported as E_MISSING and an unknown
   type as E_ATTR. */
static
attribute_s *parse_attribute(sp, s)
     struct sgmls *sp;
     char *s;
{
  attribute_s *attr = (attribute_s *)xmalloc(sizeof(*attr));
  char *kind;

  attr->name = strsave(scan_token(&s));
  kind = scan_token(&s);

  if (strcmp(kind, "IMPLIED") == 0)
    attr->type = SGMLS_ATTR_IMPLIED;
  else if (strcmp(kind, "CDATA") == 0) {
    /* parse_data() wants a line counter; the count is not used here. */
    unsigned long unused_lineno = 0;

    attr->type = SGMLS_ATTR_CDATA;
    attr->value.data.n = parse_data(s, &unused_lineno);
    attr->value.data.v = copy_data(datav, attr->value.data.n);
  }
  else if (strcmp(kind, "NOTATION") == 0) {
    attr->type = SGMLS_ATTR_NOTATION;
    attr->value.notation = lookup_notation(sp, scan_token(&s));
  }
  else if (strcmp(kind, "ENTITY") == 0) {
    int count = count_args(s);
    int i;

    attr->type = SGMLS_ATTR_ENTITY;
    if (count == 0)
      error(E_MISSING);
    attr->value.entity.n = count;
    attr->value.entity.v = (entity_s **)xmalloc(count*sizeof(entity_s *));
    for (i = 0; i < count; i++)
      attr->value.entity.v[i] = lookup_entity(sp, scan_token(&s));
  }
  else if (strcmp(kind, "TOKEN") == 0) {
    int count = count_args(s);
    int i;

    attr->type = SGMLS_ATTR_TOKEN;
    if (count == 0)
      error(E_MISSING);
    attr->value.token.n = count;
    attr->value.token.v = (char **)xmalloc(count * sizeof(char *));
    for (i = 0; i < count; i++)
      attr->value.token.v[i] = strsave(scan_token(&s));
  }
  else
    error(E_ATTR);
  return attr;
}
/* Free every attribute on the list starting at p, together with its name
   and whatever value storage its type owns.  Entities and notations that
   an attribute refers to are not freed here. */
void sgmls_free_attributes(p)
     attribute_s *p;
{
  attribute_s *following;

  for (; p != 0; p = following) {
    following = p->next;
    switch (p->type) {
    case SGMLS_ATTR_IMPLIED:
    case SGMLS_ATTR_NOTATION:
      break;
    case SGMLS_ATTR_ENTITY:
      xfree(p->value.entity.v);
      break;
    case SGMLS_ATTR_TOKEN:
      {
	int i = 0;

	while (i < p->value.token.n) {
	  free(p->value.token.v[i]);
	  i++;
	}
	xfree(p->value.token.v);
      }
      break;
    case SGMLS_ATTR_CDATA:
      if (p->value.data.v != 0) {
	/* copy_data() puts all segment text in one allocation that
	   starts at the first segment. */
	free(p->value.data.v[0].s);
	free(p->value.data.v);
      }
      break;
    }
    free(p->name);
    free(p);
  }
}
/* Return a newly allocated copy of the n data segments in v, or a null
   pointer when n is 0.  The text of all segments is copied into a single
   allocation, which the first copied segment points at the start of. */
static
data_s *copy_data(v, n)
     data_s *v;
     int n;
{
  data_s *copy;
  char *dest;
  unsigned nbytes = 0;
  int i;

  if (n == 0)
    return 0;

  copy = (data_s *)xmalloc(n*sizeof(data_s));
  for (i = 0; i < n; i++)
    nbytes += v[i].len;
  /* Never ask for a zero-sized allocation. */
  dest = xmalloc(nbytes ? nbytes : 1);
  for (i = 0; i < n; i++) {
    copy[i].s = dest;
    copy[i].len = v[i].len;
    copy[i].is_sdata = v[i].is_sdata;
    memcpy(dest, v[i].s, v[i].len);
    dest += v[i].len;
  }
  return copy;
}
/* Decode the escapes in s in place and nul-terminate the result.  Returns
   s.  Decoded data that contains a zero byte is reported as
   E_NULESCAPE. */
static
char *unescape(s)
     char *s;
{
  int len;

  len = unescape1(s);
  if (memchr(s, '\0', len) != 0)
    error(E_NULESCAPE);
  s[len] = '\0';
  return s;
}
/* Same as unescape(), except that every RECHAR in the decoded string is
   replaced by a newline character. */
static
char *unescape_file(s)
     char *s;
{
  char *p;

  s = unescape(s);
  for (p = strchr(s, RECHAR); p != 0; p = strchr(p + 1, RECHAR))
    *p = '\n';
  return s;
}
/* Unescape s, and return length of data. The data may contain 0. */
/* The decoding is done in place and the result is NOT nul-terminated
   here; callers use the returned length.  Escapes handled: `\\' (a
   backslash), `\n' (RECHAR) and `\' followed by one to three octal
   digits.  `\|' is reported as E_SDATA and any other escape as
   E_BADESCAPE. */
static
int unescape1(s)
char *s;
{
const char *p;
char *q;
/* Nothing to rewrite when there is no backslash at all. */
q = strchr(s, '\\');
if (!q)
return strlen(s);
/* p reads the escaped text, q writes the decoded text; q never gets
   ahead of p. */
p = q;
while (*p) {
if (*p == '\\') {
switch (*++p) {
case '\\':
*q++ = *p++;
break;
case 'n':
*q++ = RECHAR;
p++;
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
{
/* Up to three octal digits. */
int val = *p++ - '0';
if (*p >= '0' && *p <= '7') {
val = val*8 + (*p++ - '0');
if (*p >= '0' && *p <= '7')
val = val*8 + (*p++ - '0');
}
*q++ = (char)val;
}
break;
case '|':
/* No break: error() is declared NO_RETURN and ends in abort(). */
error(E_SDATA);
default:
error(E_BADESCAPE);
}
}
else
*q++ = *p++;
}
return q - s;
}
/* Split off the token at *pp: the token runs up to the first space or to
   the end of the string.  A terminating space is overwritten with a nul
   and *pp is advanced past it; otherwise *pp is left at the end of the
   string.  Returns the start of the token.  An empty token is reported
   as E_MISSING. */
static
char *scan_token(pp)
     char **pp;
{
  char *token = *pp;
  char *cur = token;

  while (*cur != '\0' && *cur != ' ')
    cur++;
  if (*cur == ' ')
    *cur++ = '\0';
  *pp = cur;
  if (*token == '\0')
    error(E_MISSING);
  return token;
}
/* Count the space-separated arguments in p without modifying it.  The
   first character of each argument is always taken as part of it, even
   when it is a space; after that the argument runs to the next space
   (which is skipped) or to the end of the string. */
static
int count_args(p)
     char *p;
{
  int count;

  for (count = 0; *p != '\0'; count++) {
    /* The first character belongs to the argument unconditionally. */
    p++;
    while (*p != '\0' && *p != ' ')
      p++;
    if (*p == ' ')
      p++;
  }
  return count;
}
/* Read one line from sp->fp into sp->buf, replacing the newline with a
   terminating nul.  Returns 1 when a line was read and 0 when end of
   input is reached before any character of a new line.

   sp->buf is enlarged as needed: 24 bytes at first, then doubled, capped
   at UINT_MAX; needing more than that is reported as E_LINELENGTH.
   sp->input_lineno and the file-wide input_lineno are updated so that
   errors are reported against the line being read.  Other errors: a read
   error gives E_SYSTEM, a nul character in the input gives E_NUL, and end
   of input in the middle of a line gives E_EOF. */
static
int read_line(sp)
struct sgmls *sp;
{
unsigned i = 0;
FILE *fp = sp->fp;
int c;
char *buf = sp->buf;
unsigned buf_size = sp->buf_size;
c = getc(fp);
if (c == EOF) {
input_lineno = sp->input_lineno;
if (ferror(fp))
error(E_SYSTEM);
return 0;
}
sp->input_lineno++;
input_lineno = sp->input_lineno;
for (;;) {
/* Make room for the byte about to be stored at buf[i]. */
if (i >= buf_size) {
if (buf_size == 0)
buf_size = 24;
else if (buf_size > (unsigned)UINT_MAX/2) {
/* Doubling would overflow: go to the maximum once, then give up. */
if (buf_size == (unsigned)UINT_MAX)
error(E_LINELENGTH);
buf_size = (unsigned)UINT_MAX;
}
else
buf_size *= 2;
buf = xrealloc(buf, buf_size);
sp->buf = buf;
sp->buf_size = buf_size;
}
if (c == '\0')
error(E_NUL);
if (c == '\n') {
buf[i] = '\0';
break;
}
buf[i++] = c;
c = getc(fp);
if (c == EOF) {
if (ferror(fp))
error(E_SYSTEM);
else
error(E_EOF);
}
}
return 1;
}
/* Return the notation called name that is defined at the parser's current
   subdocument level.  An unknown name is reported as E_BADNOTATION. */
static
notation_s *lookup_notation(sp, name)
     struct sgmls *sp;
     char *name;
{
  struct list *found;

  found = list_find((struct list *)sp->notations, name, sp->subdoc_level);
  if (found == 0)
    error(E_BADNOTATION);
  return &((struct notation_list *)found)->notation;
}
/* Return the entity called name that is defined at the parser's current
   subdocument level.  An unknown name is reported as E_BADENTITY. */
static
entity_s *lookup_entity(sp, name)
     struct sgmls *sp;
     char *name;
{
  struct list *found;

  found = list_find((struct list *)sp->entities, name, sp->subdoc_level);
  if (found == 0)
    error(E_BADENTITY);
  return &((struct entity_list *)found)->entity;
}
/* Like lookup_entity(), but the entity must be an external one; the
   external part of it is returned.  An internal entity is reported as
   E_INTERNALENTITY. */
static
external_entity_s *lookup_external_entity(sp, name)
     struct sgmls *sp;
     char *name;
{
  entity_s *ent;

  ent = lookup_entity(sp, name);
  if (ent->is_internal)
    error(E_INTERNALENTITY);
  return &ent->u.external;
}
/* Record the external entity *e at the current subdocument level.  The
   pending file names, public identifier and system identifier are moved
   from the parser into *e (the parser's copies are cleared), the
   attribute list of *e is set to empty, and a copy of *e is pushed on the
   front of the entity list. */
static
void define_external_entity(sp, e)
     struct sgmls *sp;
     external_entity_s *e;
{
  struct entity_list *node;

  /* Take over the pending identifiers and file names. */
  e->filenames = sp->files;
  e->nfilenames = sp->nfiles;
  e->sysid = sp->sysid;
  e->pubid = sp->pubid;
  e->attributes = 0;
  sp->files = 0;
  sp->nfiles = 0;
  sp->sysid = 0;
  sp->pubid = 0;

  node = (struct entity_list *)xmalloc(sizeof(struct entity_list));
  node->subdoc_level = sp->subdoc_level;
  node->entity.is_internal = 0;
  memcpy((UNIV)&node->entity.u.external, (UNIV)e, sizeof(*e));
  node->next = sp->entities;
  sp->entities = node;
}
/* Record the internal entity *e at the current subdocument level by
   pushing a copy of it on the front of the entity list. */
static
void define_internal_entity(sp, e)
     struct sgmls *sp;
     internal_entity_s *e;
{
  struct entity_list *node;

  node = (struct entity_list *)xmalloc(sizeof(struct entity_list));
  node->subdoc_level = sp->subdoc_level;
  node->entity.is_internal = 1;
  memcpy((UNIV)&node->entity.u.internal, (UNIV)e, sizeof(*e));
  node->next = sp->entities;
  sp->entities = node;
}
/* Record the notation *np at the current subdocument level.  The pending
   public and system identifiers are moved from the parser into *np (the
   parser's copies are cleared) and a copy of *np is pushed on the front
   of the notation list. */
static
void define_notation(sp, np)
     struct sgmls *sp;
     notation_s *np;
{
  struct notation_list *node;

  /* Take over the pending identifiers. */
  np->pubid = sp->pubid;
  np->sysid = sp->sysid;
  sp->pubid = 0;
  sp->sysid = 0;

  node = (struct notation_list *)xmalloc(sizeof(struct notation_list));
  node->subdoc_level = sp->subdoc_level;
  memcpy((UNIV)&node->notation, (UNIV)np, sizeof(*np));
  node->next = sp->notations;
  sp->notations = node;
}
/* Search the leading run of nodes whose subdoc_level equals level for one
   called name.  The search stops at the first node of another level.
   Returns the matching node, or a null pointer when there is none. */
static
struct list *list_find(p, name, level)
     struct list *p;
     char *name;
     int level;
{
  while (p != 0 && p->subdoc_level == level) {
    if (strcmp(p->name, name) == 0)
      return p;
    p = p->next;
  }
  return 0;
}
/* Move all the items in the list whose subdoc level is level to the
end of the list and make their subdoc_level -1. */
/* Only the leading run of items at that level is considered.  The moved
   items keep their relative order. */
static
void list_finish_level(listp, level)
struct list **listp;
int level;
{
struct list **pp, *next_level, *old_level;
/* Mark the leading run as finished; pp ends up at the link that follows
   the run. */
for (pp = listp; *pp && (*pp)->subdoc_level == level; pp = &(*pp)->next)
(*pp)->subdoc_level = -1;
/* Cut the run off from the rest of the list. */
next_level = *pp;
*pp = 0;
/* When the run is empty, pp == listp, so old_level is null here and the
   list is left unchanged. */
old_level = *listp;
*listp = next_level;
/* Find the end of what remains and append the run there. */
for (pp = listp; *pp; pp = &(*pp)->next)
;
*pp = old_level;
}
/* Insert attribute a into the list at *pp, in front of the first
   attribute whose name does not compare less than a's name (or at the
   end when there is no such attribute). */
static
void add_attribute(pp, a)
     attribute_s **pp, *a;
{
  attribute_s **link = pp;

  while (*link != 0 && strcmp((*link)->name, a->name) < 0)
    link = &(*link)->next;
  a->next = *link;
  *link = a;
}
/* Return a newly allocated copy of the string s.  A null s gives a null
   result. */
static
char *strsave(s)
     char *s;
{
  char *copy;

  if (s == 0)
    return 0;
  copy = xmalloc(strlen(s) + 1);
  strcpy(copy, s);
  return copy;
}
/* malloc() n bytes; a null result is reported as E_NOMEM. */
static
UNIV xmalloc(n)
     unsigned n;
{
  UNIV mem;

  mem = malloc(n);
  if (mem == 0)
    error(E_NOMEM);
  return mem;
}
/* Resize the block p to n bytes; a null result is reported as E_NOMEM.
   ANSI C lets realloc() take a null first argument, but not every
   library appears to support that, so a null p is given to malloc()
   instead. */
static
UNIV xrealloc(p, n)
     UNIV p;
     unsigned n;
{
  UNIV mem;

  if (p)
    mem = realloc(p, n);
  else
    mem = malloc(n);
  if (!mem)
    error(E_NOMEM);
  return mem;
}
/* Report error num: call the current error handler with the code, its
   message from errlist[] and the current input line number.  If the
   handler returns, the program is aborted. */
static NO_RETURN
void error(num)
     enum error_code num;
{
  char *msg = errlist[num];

  (*errhandler)((int)num, msg, input_lineno);
  abort();
}
/* Error handler installed by default: print the input line number and
   the message on stderr, then exit with status 1.  The numeric code num
   is not used. */
static
void default_errhandler(num, msg, lineno)
     int num;
     char *msg;
     unsigned long lineno;
{
  fprintf(stderr, "Line %lu: %s\n", lineno, msg);
  exit(1);
}