freebsd-dev/usr.bin/indent/lexi.c
Bruce Evans 611e9fdbb4 Expanded the size of the keyword table from 100 to 1000 entries so that
the number of typedef-names is not so limited.  Same as in rev.1.4.

Added the "const" and "volatile" to the keyword table.  Rev.1.4 added
these but they were misclassified so they were not formatted as types.
indent still doesn't really understand them.  E.g., it mangles
"char * const *foo" and "char *const *foo".  This change mainly stops
it mangling "char const foo" to "char<declaration-indent>const foo".
2004-02-09 13:13:03 +00:00

593 lines
15 KiB
C

/*
* Copyright (c) 1985 Sun Microsystems, Inc.
* Copyright (c) 1980, 1993
* The Regents of the University of California. All rights reserved.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#if 0
#ifndef lint
static char sccsid[] = "@(#)lexi.c 8.1 (Berkeley) 6/6/93";
#endif /* not lint */
#endif
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
/*
* Here we have the token scanner for indent. It scans off one token and puts
* it in the global variable "token". It returns a code, indicating the type
* of token scanned.
*/
#include <err.h>
#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include "indent_globs.h"
#include "indent_codes.h"
#include "indent.h"
/*
 * Character classes stored in the chartype[] table below.  A table entry of
 * 0 means the character belongs to neither class.
 */
#define alphanum 1	/* character may be part of an identifier or number */
#define opchar 3	/* operator character; only tested in disabled code */
/*
 * One entry of the keyword table: the keyword's spelling and the class code
 * that lexi() switches on once the keyword has been recognised.
 */
struct templ {
    const char *rwd;		/* keyword text; a null pointer ends the table */
    int rwcode;			/* keyword class, see the list below */
};

/*
 * Keyword table searched by lexi() and extended at run time by addkey().
 * The table is terminated by an entry whose rwd is null.
 *
 * Meaning of rwcode, as interpreted by lexi():
 *	0  no special handling (token is returned as an ordinary identifier)
 *	1  "switch"
 *	2  "case" / "default"
 *	3  "struct", "union", "enum" (the following token is a declaration)
 *	4  declaration keywords (types, storage classes, qualifiers)
 *	5  keywords followed by a parenthesised expression: if, while, for
 *	6  keywords not followed by a parenthesised expression: do, else
 *	7  "sizeof"
 */
struct templ specials[1000] =
{
    {"switch", 1},
    {"case", 2},
    {"break", 0},
    {"struct", 3},
    {"union", 3},
    {"enum", 3},
    {"default", 2},
    {"int", 4},
    {"char", 4},
    {"float", 4},
    {"double", 4},
    {"long", 4},
    {"short", 4},
    {"typedef", 4},		/* was misspelled "typdef" and never matched */
    {"unsigned", 4},
    {"register", 4},
    {"static", 4},
    {"global", 4},		/* not a C keyword; retained from the original table */
    {"extern", 4},
    {"void", 4},
    {"const", 4},
    {"volatile", 4},
    {"goto", 0},
    {"return", 0},
    {"if", 5},
    {"while", 5},
    {"for", 5},
    {"else", 6},
    {"do", 6},
    {"sizeof", 7},
    {0, 0}
};
char chartype[128] =
{
    /*
     * Classification of each 7-bit character code, one entry per code:
     * 1 marks characters that can appear in identifiers and numbers,
     * 3 marks operator characters, 0 marks everything else.
     * Sixteen codes per row; the range covered is noted on the right.
     */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00 - 0x0f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x10 - 0x1f */
    0, 3, 0, 0, 1, 3, 3, 0, 0, 0, 3, 3, 0, 3, 0, 3,	/* space .. '/' */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 3, 3, 3, 3,	/* '0' .. '?' */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* '@' .. 'O' */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 3, 1,	/* 'P' .. '_' */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* '`' .. 'o' */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 3, 0, 3, 0	/* 'p' .. DEL */
};
/*
 * Look up the class of a source character in chartype[].
 *
 * chartype[] only covers the 7-bit character codes, while the input buffer
 * holds plain chars whose value may lie outside 0..127 (negative where char
 * is signed).  Such characters are reported as class 0 (neither alphanumeric
 * nor operator) instead of indexing outside the table.
 */
static int
lexi_chartype(char c)
{
    unsigned char uc = (unsigned char)c;

    return (uc < sizeof chartype ? chartype[uc] : 0);
}

/*
 * Scan one token starting at buf_ptr, copy its text into the token buffer
 * (s_token .. e_token, NUL terminated) and return the code describing it.
 *
 * Alphanumeric tokens (identifiers, keywords, numbers) are handled first and
 * return directly; everything else goes through the switch further down.
 * Besides the token buffer, the function updates parser state in "ps"
 * (col_1, last_nl, last_u_d, its_a_keyword, sizeof_keyword, cast_mask,
 * block_init, want_blank, procname, in_parameter_declaration) and refills the
 * input buffer through fill_buffer() whenever buf_ptr reaches buf_end.
 *
 * Returns 0 instead of "newline" when a newline is scanned after had_eof has
 * been set.
 */
int
lexi(void)
{
    int unary_delim;		/* this is set to 1 if the current token
				 * forces a following operator to be unary */
    static int last_code;	/* the last token type returned */
    static int l_struct;	/* set to 1 if the last token was 'struct' */
    int code;			/* internal code to be returned */
    char qchar;			/* the delimiter character for a string */

    e_token = s_token;		/* point to start of place to save token */
    unary_delim = false;
    ps.col_1 = ps.last_nl;	/* tell world that this token started in
				 * column 1 iff the last thing scanned was nl */
    ps.last_nl = false;

    while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
	ps.col_1 = false;	/* leading blanks imply token is not in column
				 * 1 */
	if (++buf_ptr >= buf_end)
	    fill_buffer();
    }

    /* Scan an alphanumeric token */
    if (lexi_chartype(*buf_ptr) == alphanum ||
	(buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
	/*
	 * we have a character or number
	 */
	const char *j;		/* used for searching thru list of reserved
				 * words */
	struct templ *p;

	if (isdigit((unsigned char)*buf_ptr) ||
	    (buf_ptr[0] == '.' && isdigit((unsigned char)buf_ptr[1]))) {
	    /* A numeric constant: hex, decimal or floating point */
	    int seendot = 0,
		seenexp = 0,
		seensfx = 0;

	    if (*buf_ptr == '0' &&
		(buf_ptr[1] == 'x' || buf_ptr[1] == 'X')) {
		/* copy the "0x" prefix, then all hex digits */
		*e_token++ = *buf_ptr++;
		*e_token++ = *buf_ptr++;
		while (isxdigit((unsigned char)*buf_ptr)) {
		    CHECK_SIZE_TOKEN;
		    *e_token++ = *buf_ptr++;
		}
	    }
	    else
		while (1) {
		    if (*buf_ptr == '.') {
			if (seendot)
			    break;	/* second '.' ends the number */
			else
			    seendot++;
		    }
		    CHECK_SIZE_TOKEN;
		    *e_token++ = *buf_ptr++;
		    if (!isdigit((unsigned char)*buf_ptr) && *buf_ptr != '.') {
			if ((*buf_ptr != 'E' && *buf_ptr != 'e') || seenexp)
			    break;
			else {
			    /*
			     * Exponent: no '.' may follow it, and it may
			     * carry an explicit sign.
			     */
			    seenexp++;
			    seendot++;
			    CHECK_SIZE_TOKEN;
			    *e_token++ = *buf_ptr++;
			    if (*buf_ptr == '+' || *buf_ptr == '-')
				*e_token++ = *buf_ptr++;
			}
		    }
		}
	    /*
	     * Integer suffixes: at most one U/u (bit 1 of seensfx) and one
	     * L/l or LL/ll (bit 2 of seensfx), in either order.
	     */
	    while (1) {
		if (!(seensfx & 1) &&
		    (*buf_ptr == 'U' || *buf_ptr == 'u')) {
		    CHECK_SIZE_TOKEN;
		    *e_token++ = *buf_ptr++;
		    seensfx |= 1;
		    continue;
		}
		if (!(seensfx & 2) &&
		    (*buf_ptr == 'L' || *buf_ptr == 'l')) {
		    CHECK_SIZE_TOKEN;
		    if (buf_ptr[1] == buf_ptr[0])
			*e_token++ = *buf_ptr++;
		    *e_token++ = *buf_ptr++;
		    seensfx |= 2;
		    continue;
		}
		break;
	    }
	}
	else
	    while (lexi_chartype(*buf_ptr) == alphanum || *buf_ptr == BACKSLASH) {
		/* fill_buffer() terminates buffer with newline */
		if (*buf_ptr == BACKSLASH) {
		    if (*(buf_ptr + 1) == '\n') {
			/* backslash-newline inside the word: skip it */
			buf_ptr += 2;
			if (buf_ptr >= buf_end)
			    fill_buffer();
		    } else
			break;
		}
		CHECK_SIZE_TOKEN;
		/* copy it over */
		*e_token++ = *buf_ptr++;
		if (buf_ptr >= buf_end)
		    fill_buffer();
	    }
	*e_token++ = '\0';
	while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
	    if (++buf_ptr >= buf_end)
		fill_buffer();
	}
	ps.its_a_keyword = false;
	ps.sizeof_keyword = false;
	if (l_struct) {		/* if last token was 'struct', then this token
				 * should be treated as a declaration */
	    l_struct = false;
	    last_code = ident;
	    ps.last_u_d = true;
	    return (decl);
	}
	ps.last_u_d = false;	/* Operator after identifier is binary */
	last_code = ident;	/* Remember that this is the code we will
				 * return */

	/*
	 * This loop will check if the token is a keyword.
	 */
	for (p = specials; (j = p->rwd) != 0; p++) {
	    const char *q = s_token;	/* point at scanned token */

	    if (*j++ != *q++ || *j++ != *q++)
		continue;	/* This test depends on the fact that
				 * identifiers are always at least 1 character
				 * long (ie. the first two bytes of the
				 * identifier are always meaningful) */
	    if (q[-1] == 0)
		break;		/* a one-character token matched a
				 * one-character keyword; p is left on it */
	    while (*q++ == *j)
		if (*j++ == 0)
		    goto found_keyword;	/* I wish that C had a multi-level
					 * break... */
	}
	if (p->rwd) {		/* we have a keyword */
    found_keyword:
	    ps.its_a_keyword = true;
	    ps.last_u_d = true;
	    switch (p->rwcode) {
	    case 1:		/* it is a switch */
		return (swstmt);
	    case 2:		/* a case or default */
		return (casestmt);

	    case 3:		/* a "struct" */
		if (ps.p_l_follow)
		    break;	/* inside parens: cast */
		l_struct = true;

		/*
		 * Next time around, we will want to know that we have had a
		 * 'struct'
		 */
		/* FALLTHROUGH */
	    case 4:		/* one of the declaration keywords */
		if (ps.p_l_follow) {
		    ps.cast_mask |= 1 << ps.p_l_follow;
		    break;	/* inside parens: cast */
		}
		last_code = decl;
		return (decl);

	    case 5:		/* if, while, for */
		return (sp_paren);

	    case 6:		/* do, else */
		return (sp_nparen);

	    case 7:
		ps.sizeof_keyword = true;
		/* FALLTHROUGH */
	    default:		/* all others are treated like any other
				 * identifier */
		return (ident);
	    }			/* end of switch */
	}			/* end of if (found_it) */
	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0) {
	    /*
	     * An identifier followed by '(' at the outermost level: unless a
	     * ")" directly followed by ';' or ',' is found in the rest of the
	     * buffer, record the identifier as a procedure name.
	     */
	    char *tp = buf_ptr;

	    while (tp < buf_end)
		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
		    goto not_proc;
	    strncpy(ps.procname, token, sizeof ps.procname - 1);
	    ps.in_parameter_declaration = 1;
	    rparen_count = 1;
    not_proc:;
	}
	/*
	 * The following hack attempts to guess whether or not the current
	 * token is in fact a declaration keyword -- one that has been
	 * typedefd
	 */
	if (((*buf_ptr == '*' && buf_ptr[1] != '=') ||
		isalpha((unsigned char)*buf_ptr) || *buf_ptr == '_')
		&& !ps.p_l_follow
	        && !ps.block_init
		&& (ps.last_token == rparen || ps.last_token == semicolon ||
		    ps.last_token == decl ||
		    ps.last_token == lbrace || ps.last_token == rbrace)) {
	    ps.its_a_keyword = true;
	    ps.last_u_d = true;
	    last_code = decl;
	    return decl;
	}
	if (last_code == decl)	/* if this is a declared variable, then
				 * following sign is unary */
	    ps.last_u_d = true;	/* will make "int a -1" work */
	last_code = ident;
	return (ident);		/* the ident is not in the list */
    }				/* end of processing for alphanumeric
				 * character */

    /* Scan a non-alphanumeric token */

    *e_token++ = *buf_ptr;	/* if it is only a one-character token, it is
				 * moved here */
    *e_token = '\0';
    if (++buf_ptr >= buf_end)
	fill_buffer();

    switch (*token) {
    case '\n':
	unary_delim = ps.last_u_d;
	ps.last_nl = true;	/* remember that we just had a newline */
	code = (had_eof ? 0 : newline);

	/*
	 * if data has been exhausted, the newline is a dummy, and we should
	 * return code to stop
	 */
	break;

    case '\'':			/* start of quoted character */
    case '"':			/* start of string */
	qchar = *token;
	if (troff) {
	    e_token[-1] = '`';
	    if (qchar == '"')
		*e_token++ = '`';
	    e_token = chfont(&bodyf, &stringf, e_token);
	}
	do {			/* copy the string */
	    while (1) {		/* move one character or [/<char>]<char> */
		if (*buf_ptr == '\n') {
		    printf("%d: Unterminated literal\n", line_no);
		    goto stop_lit;
		}
		CHECK_SIZE_TOKEN;	/* Only have to do this once in this loop,
					 * since CHECK_SIZE guarantees that there
					 * are at least 5 entries left */
		*e_token = *buf_ptr++;
		if (buf_ptr >= buf_end)
		    fill_buffer();
		if (*e_token == BACKSLASH) {	/* if escape, copy extra char */
		    if (*buf_ptr == '\n')	/* check for escaped newline */
			++line_no;
		    if (troff) {
			*++e_token = BACKSLASH;
			if (*buf_ptr == BACKSLASH)
			    *++e_token = BACKSLASH;
		    }
		    *++e_token = *buf_ptr++;
		    ++e_token;	/* we must increment this again because we
				 * copied two chars */
		    if (buf_ptr >= buf_end)
			fill_buffer();
		}
		else
		    break;	/* we copied one character */
	    }			/* end of while (1) */
	} while (*e_token++ != qchar);
	if (troff) {
	    e_token = chfont(&stringf, &bodyf, e_token - 1);
	    if (qchar == '"')
		*e_token++ = '\'';
	}
stop_lit:
	code = ident;
	break;

    case ('('):
    case ('['):
	unary_delim = true;
	code = lparen;
	break;

    case (')'):
    case (']'):
	code = rparen;
	break;

    case '#':
	unary_delim = ps.last_u_d;
	code = preesc;
	break;

    case '?':
	unary_delim = true;
	code = question;
	break;

    case (':'):
	code = colon;
	unary_delim = true;
	break;

    case (';'):
	unary_delim = true;
	code = semicolon;
	break;

    case ('{'):
	unary_delim = true;

	/*
	 * if (ps.in_or_st) ps.block_init = 1;
	 */
	/* ?	code = ps.block_init ? lparen : lbrace; */
	code = lbrace;
	break;

    case ('}'):
	unary_delim = true;
	/* ?	code = ps.block_init ? rparen : rbrace; */
	code = rbrace;
	break;

    case 014:			/* a form feed */
	unary_delim = ps.last_u_d;
	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
				 * right */
	code = form_feed;
	break;

    case (','):
	unary_delim = true;
	code = comma;
	break;

    case '.':
	unary_delim = false;
	code = period;
	break;

    case '-':
    case '+':			/* check for -, +, --, ++ */
	code = (ps.last_u_d ? unary_op : binary_op);
	unary_delim = true;

	if (*buf_ptr == token[0]) {
	    /* check for doubled character */
	    *e_token++ = *buf_ptr++;
	    /* input buffer exhaustion is checked after the switch */
	    if (last_code == ident || last_code == rparen) {
		code = (ps.last_u_d ? unary_op : postop);
		/* check for following ++ or -- */
		unary_delim = false;
	    }
	}
	else if (*buf_ptr == '=')
	    /* check for operator += */
	    *e_token++ = *buf_ptr++;
	else if (*buf_ptr == '>') {
	    /* check for operator -> */
	    *e_token++ = *buf_ptr++;
	    if (!pointer_as_binop) {
		unary_delim = false;
		code = unary_op;
		ps.want_blank = false;
	    }
	}
	break;			/* input buffer exhaustion is checked after
				 * the switch */

    case '=':
	if (ps.in_or_st)
	    ps.block_init = 1;
#ifdef undef
	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
	    e_token[-1] = *buf_ptr++;
	    if ((e_token[-1] == '<' || e_token[-1] == '>') && e_token[-1] == *buf_ptr)
		*e_token++ = *buf_ptr++;
	    *e_token++ = '=';	/* Flip =+ to += */
	    *e_token = 0;
	}
#else
	if (*buf_ptr == '=') {/* == */
	    *e_token++ = '=';	/* append the second '=' */
	    buf_ptr++;
	    *e_token = 0;
	}
#endif
	code = binary_op;
	unary_delim = true;
	break;

    case '>':
    case '<':
    case '!':			/* ops like <, <<, <=, !=, etc */
	if (*buf_ptr == '>' || *buf_ptr == '<' || *buf_ptr == '=') {
	    *e_token++ = *buf_ptr;
	    if (++buf_ptr >= buf_end)
		fill_buffer();
	}
	if (*buf_ptr == '=')
	    *e_token++ = *buf_ptr++;
	code = (ps.last_u_d ? unary_op : binary_op);
	unary_delim = true;
	break;

    default:
	if (token[0] == '/' && *buf_ptr == '*') {
	    /* it is start of comment */
	    *e_token++ = '*';

	    if (++buf_ptr >= buf_end)
		fill_buffer();

	    code = comment;
	    unary_delim = ps.last_u_d;
	    break;
	}
	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
	    /*
	     * handle ||, &&, etc, and also things as in int *****i
	     *
	     * The run can be arbitrarily long, so make sure the token buffer
	     * has room before every character that is appended.
	     */
	    CHECK_SIZE_TOKEN;
	    *e_token++ = *buf_ptr;
	    if (++buf_ptr >= buf_end)
		fill_buffer();
	}
	code = (ps.last_u_d ? unary_op : binary_op);
	unary_delim = true;


    }				/* end of switch */
    if (code != newline) {
	l_struct = false;
	last_code = code;
    }
    if (buf_ptr >= buf_end)	/* check for input buffer empty */
	fill_buffer();
    ps.last_u_d = unary_delim;
    *e_token = '\0';		/* null terminate the token */
    return (code);
}
/*
 * Add the given keyword to the keyword table, using val as the keyword type.
 *
 * key: keyword text.  The pointer itself is stored in the table (the string
 *      is not copied), so it must remain valid for as long as the table is
 *      used.
 * val: class code stored as the entry's rwcode.
 *
 * If key is already present the table is left unchanged, so the existing
 * class code wins.  The last slot of the table is reserved for the
 * terminating null entry; once only that slot is free, further keywords are
 * silently ignored.
 */
void
addkey(char *key, int val)
{
    /* Last slot of the table; it must always hold the terminator. */
    struct templ *const last =
	specials + sizeof specials / sizeof specials[0] - 1;
    struct templ *p;

    for (p = specials; p->rwd != NULL; p++) {
	/* compare first characters before paying for a full strcmp() */
	if (p->rwd[0] == key[0] && strcmp(p->rwd, key) == 0)
	    return;
    }

    /*
     * p now points at the terminator.  Storing the keyword here moves the
     * terminator to p[1], so p itself must not be the last slot; otherwise
     * the new terminator would be written past the end of the table.
     */
    if (p >= last)
	return;			/* For now, table overflows are silently
				 * ignored */
    p->rwd = key;
    p->rwcode = val;
    p[1].rwd = NULL;
    p[1].rwcode = 0;
}