indent(1): improve predictability of lexi()

lexi() reads the input stream and categorizes the next token. indent will
sometimes buffer up a sequence of tokens in order to rearrange them. That is
needed for properly cuddling else or placing braces correctly according to
the chosen style (KNF vs Allman) when comments are around. The loop that
buffers tokens up uses lexi() to decide if it's time to stop buffering. Then
the temporary buffer is used to feed lexi() the same tokens again, this time
for normal processing.

The problem is that lexi(), apart from recognizing the token, can change
a lot of information about the current state, for example ps.last_nl,
ps.keyword, buf_ptr. It also abandons leading whitespace, which is needed
mainly for comment-related considerations. So the call to lexi() while
tokens are buffered up and categorized can change the state before they're
read again for normal processing, which may easily change the
interpretation of the current state and lead to incorrect output.

To work around the problems:
1) copy the whitespace into the save_com buffer so that it will be read
again when processed
2) trick lexi() into modifying a temporary copy of the parser state instead
of the original.
This commit is contained in:
Piotr Pawel Stefaniak 2018-06-03 14:13:11 +00:00
parent ec5ac89ecd
commit 63c3f22696
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=334560
3 changed files with 93 additions and 51 deletions

View File

@ -102,6 +102,7 @@ main(int argc, char **argv)
int last_else = 0; /* true iff last keyword was an else */
const char *profile_name = NULL;
const char *envval = NULL;
struct parser_state transient_state; /* a copy for lookup */
/*-----------------------------------------------*\
| INITIALIZATION |
@ -324,7 +325,7 @@ main(int argc, char **argv)
int is_procname;
int comment_buffered = false;
type_code = lexi(); /* lexi reads one token. The actual
type_code = lexi(&ps); /* lexi reads one token. The actual
* characters read are stored in "token". lexi
* returns a code indicating the type of token */
is_procname = ps.procname[0];
@ -460,9 +461,48 @@ main(int argc, char **argv)
break;
}
} /* end of switch */
if (type_code != 0) /* we must make this check, just in case there
* was an unexpected EOF */
type_code = lexi(); /* read another token */
/*
* We must make this check, just in case there was an unexpected
* EOF.
*/
if (type_code != 0) {
/*
* The only intended purpose of calling lexi() below is to
* categorize the next token in order to decide whether to
* continue buffering forthcoming tokens. Once the buffering
* is over, lexi() will be called again elsewhere on all of
* the tokens - this time for normal processing.
*
* Calling it for this purpose is a bug, because lexi() also
* changes the parser state and discards leading whitespace,
* which is needed mostly for comment-related considerations.
*
* Work around the former problem by giving lexi() a copy of
* the current parser state and discard it if the call turned
* out to be just a look ahead.
*
* Work around the latter problem by copying all whitespace
* characters into the buffer so that the later lexi() call
* will read them.
*/
if (sc_end != NULL) {
while (*buf_ptr == ' ' || *buf_ptr == '\t') {
*sc_end++ = *buf_ptr++;
if (sc_end >= &save_com[sc_size]) {
abort();
}
}
if (buf_ptr >= buf_end) {
fill_buffer();
}
}
transient_state = ps;
type_code = lexi(&transient_state); /* read another token */
if (type_code != newline && type_code != form_feed &&
type_code != comment && !transient_state.search_brace) {
ps = transient_state;
}
}
} /* end of while (search_brace) */
last_else = 0;
check_type:

View File

@ -36,7 +36,7 @@ int compute_code_target(void);
int compute_label_target(void);
int count_spaces(int, char *);
int count_spaces_until(int, char *, char *);
int lexi(void);
int lexi(struct parser_state *);
void diag2(int, const char *);
void diag3(int, const char *, int);
void diag4(int, const char *, int, int);

View File

@ -141,7 +141,7 @@ strcmp_type(const void *e1, const void *e2)
}
int
lexi(void)
lexi(struct parser_state *state)
{
int unary_delim; /* this is set to 1 if the current token
* forces a following operator to be unary */
@ -152,12 +152,13 @@ lexi(void)
e_token = s_token; /* point to start of place to save token */
unary_delim = false;
ps.col_1 = ps.last_nl; /* tell world that this token started in
* column 1 iff the last thing scanned was nl */
ps.last_nl = false;
state->col_1 = state->last_nl; /* tell world that this token started
* in column 1 iff the last thing
* scanned was a newline */
state->last_nl = false;
while (*buf_ptr == ' ' || *buf_ptr == '\t') { /* get rid of blanks */
ps.col_1 = false; /* leading blanks imply token is not in column
state->col_1 = false; /* leading blanks imply token is not in column
* 1 */
if (++buf_ptr >= buf_end)
fill_buffer();
@ -281,18 +282,19 @@ lexi(void)
if (++buf_ptr >= buf_end)
fill_buffer();
}
ps.keyword = 0;
if (l_struct && !ps.p_l_follow) {
state->keyword = 0;
if (l_struct && !state->p_l_follow) {
/* if last token was 'struct' and we're not
* in parentheses, then this token
* should be treated as a declaration */
l_struct = false;
last_code = ident;
ps.last_u_d = true;
state->last_u_d = true;
return (decl);
}
ps.last_u_d = l_struct; /* Operator after identifier is binary
* unless last token was 'struct' */
state->last_u_d = l_struct; /* Operator after identifier is
* binary unless last token was
* 'struct' */
l_struct = false;
last_code = ident; /* Remember that this is the code we will
* return */
@ -310,13 +312,13 @@ lexi(void)
strcmp(u, "_t") == 0) || (typename_top >= 0 &&
bsearch(s_token, typenames, typename_top + 1,
sizeof(typenames[0]), strcmp_type))) {
ps.keyword = 4; /* a type name */
ps.last_u_d = true;
state->keyword = 4; /* a type name */
state->last_u_d = true;
goto found_typename;
}
} else { /* we have a keyword */
ps.keyword = p->rwcode;
ps.last_u_d = true;
state->keyword = p->rwcode;
state->last_u_d = true;
switch (p->rwcode) {
case 7: /* it is a switch */
return (swstmt);
@ -333,9 +335,9 @@ lexi(void)
case 4: /* one of the declaration keywords */
found_typename:
if (ps.p_l_follow) {
if (state->p_l_follow) {
/* inside parens: cast, param list, offsetof or sizeof */
ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
state->cast_mask |= (1 << state->p_l_follow) & ~state->not_cast_mask;
break;
}
last_code = decl;
@ -358,15 +360,15 @@ lexi(void)
return (ident);
} /* end of switch */
} /* end of if (found_it) */
if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0 &&
ps.in_parameter_declaration == 0 && ps.block_init == 0) {
if (*buf_ptr == '(' && state->tos <= 1 && state->ind_level == 0 &&
state->in_parameter_declaration == 0 && state->block_init == 0) {
char *tp = buf_ptr;
while (tp < buf_end)
if (*tp++ == ')' && (*tp == ';' || *tp == ','))
goto not_proc;
strncpy(ps.procname, token, sizeof ps.procname - 1);
if (ps.in_decl)
ps.in_parameter_declaration = 1;
strncpy(state->procname, token, sizeof state->procname - 1);
if (state->in_decl)
state->in_parameter_declaration = 1;
return (last_code = funcname);
not_proc:;
}
@ -376,19 +378,19 @@ lexi(void)
* typedefd
*/
if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
&& !ps.p_l_follow
&& !ps.block_init
&& (ps.last_token == rparen || ps.last_token == semicolon ||
ps.last_token == decl ||
ps.last_token == lbrace || ps.last_token == rbrace)) {
ps.keyword = 4; /* a type name */
ps.last_u_d = true;
&& !state->p_l_follow
&& !state->block_init
&& (state->last_token == rparen || state->last_token == semicolon ||
state->last_token == decl ||
state->last_token == lbrace || state->last_token == rbrace)) {
state->keyword = 4; /* a type name */
state->last_u_d = true;
last_code = decl;
return decl;
}
if (last_code == decl) /* if this is a declared variable, then
* following sign is unary */
ps.last_u_d = true; /* will make "int a -1" work */
state->last_u_d = true; /* will make "int a -1" work */
last_code = ident;
return (ident); /* the ident is not in the list */
} /* end of procesing for alpanum character */
@ -403,8 +405,8 @@ lexi(void)
switch (*token) {
case '\n':
unary_delim = ps.last_u_d;
ps.last_nl = true; /* remember that we just had a newline */
unary_delim = state->last_u_d;
state->last_nl = true; /* remember that we just had a newline */
code = (had_eof ? 0 : newline);
/*
@ -473,7 +475,7 @@ lexi(void)
break;
case '#':
unary_delim = ps.last_u_d;
unary_delim = state->last_u_d;
code = preesc;
break;
@ -496,21 +498,21 @@ lexi(void)
unary_delim = true;
/*
* if (ps.in_or_st) ps.block_init = 1;
* if (state->in_or_st) state->block_init = 1;
*/
/* ? code = ps.block_init ? lparen : lbrace; */
/* ? code = state->block_init ? lparen : lbrace; */
code = lbrace;
break;
case ('}'):
unary_delim = true;
/* ? code = ps.block_init ? rparen : rbrace; */
/* ? code = state->block_init ? rparen : rbrace; */
code = rbrace;
break;
case 014: /* a form feed */
unary_delim = ps.last_u_d;
ps.last_nl = true; /* remember this so we can set 'ps.col_1'
unary_delim = state->last_u_d;
state->last_nl = true; /* remember this so we can set 'state->col_1'
* right */
code = form_feed;
break;
@ -527,7 +529,7 @@ lexi(void)
case '-':
case '+': /* check for -, +, --, ++ */
code = (ps.last_u_d ? unary_op : binary_op);
code = (state->last_u_d ? unary_op : binary_op);
unary_delim = true;
if (*buf_ptr == token[0]) {
@ -535,7 +537,7 @@ lexi(void)
*e_token++ = *buf_ptr++;
/* buffer overflow will be checked at end of loop */
if (last_code == ident || last_code == rparen) {
code = (ps.last_u_d ? unary_op : postop);
code = (state->last_u_d ? unary_op : postop);
/* check for following ++ or -- */
unary_delim = false;
}
@ -548,14 +550,14 @@ lexi(void)
*e_token++ = *buf_ptr++;
unary_delim = false;
code = unary_op;
ps.want_blank = false;
state->want_blank = false;
}
break; /* buffer overflow will be checked at end of
* switch */
case '=':
if (ps.in_or_st)
ps.block_init = 1;
if (state->in_or_st)
state->block_init = 1;
#ifdef undef
if (chartype[*buf_ptr] == opchar) { /* we have two char assignment */
e_token[-1] = *buf_ptr++;
@ -586,7 +588,7 @@ lexi(void)
}
if (*buf_ptr == '=')
*e_token++ = *buf_ptr++;
code = (ps.last_u_d ? unary_op : binary_op);
code = (state->last_u_d ? unary_op : binary_op);
unary_delim = true;
break;
@ -599,7 +601,7 @@ lexi(void)
fill_buffer();
code = comment;
unary_delim = ps.last_u_d;
unary_delim = state->last_u_d;
break;
}
while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
@ -610,7 +612,7 @@ lexi(void)
if (++buf_ptr >= buf_end)
fill_buffer();
}
code = (ps.last_u_d ? unary_op : binary_op);
code = (state->last_u_d ? unary_op : binary_op);
unary_delim = true;
@ -621,7 +623,7 @@ lexi(void)
}
if (buf_ptr >= buf_end) /* check for input buffer empty */
fill_buffer();
ps.last_u_d = unary_delim;
state->last_u_d = unary_delim;
*e_token = '\0'; /* null terminate the token */
return (code);
}