indent(1): improve predictability of lexi()

lexi() reads the input stream and categorizes the next token. indent will sometimes buffer up a sequence of tokens in order to rearrange them. That is needed for properly cuddling else or placing braces correctly according to the chosen style (KNF vs Allman) when comments are around. The loop that buffers tokens up uses lexi() to decide if it's time to stop buffering. Then the temporary buffer is used to feed lexi() the same tokens again, this time for normal processing. The problem is that lexi(), apart from recognizing the token, can change a lot of information about the current state, for example ps.last_nl, ps.keyword, buf_ptr. It also abandons leading whitespace, which is needed mainly for comment-related considerations. So the call to lexi() while tokens are buffered up and categorized can change the state before they're read again for normal processing, which may easily result in changing interpretation of the current state and lead to incorrect output. To work around the problems: 1) copy the whitespace into the save_com buffer so that it will be read again when processed; 2) trick lexi() into modifying a temporary copy of the parser state instead of the original.
svn path=/head/; revision=334560
2018-06-03 14:13:11 +00:00 · 2018-06-03 14:13:11 +00:00 · 63c3f22696 · 2020-12-20 02:59:44 +00:00
commit 63c3f22696
parent ec5ac89ecd
3 changed files with 93 additions and 51 deletions
--- a/usr.bin/indent/indent.c
+++ b/usr.bin/indent/indent.c
@ -102,6 +102,7 @@ main(int argc, char **argv)
    int         last_else = 0;	/* true iff last keyword was an else */
    const char *profile_name = NULL;
    const char *envval = NULL;
+    struct parser_state transient_state; /* a copy for lookup */

    /*-----------------------------------------------*\
    |		      INITIALIZATION		      |
@ -324,7 +325,7 @@ main(int argc, char **argv)
 	int         is_procname;
 	int comment_buffered = false;

-	type_code = lexi();	/* lexi reads one token.  The actual
+	type_code = lexi(&ps);	/* lexi reads one token.  The actual
 				 * characters read are stored in "token". lexi
 				 * returns a code indicating the type of token */
 	is_procname = ps.procname[0];
@ -460,9 +461,48 @@ main(int argc, char **argv)
 		    break;
 		}
 	    }			/* end of switch */
-	    if (type_code != 0)	/* we must make this check, just in case there
-				 * was an unexpected EOF */
-		type_code = lexi();	/* read another token */
+	    /*
+	     * We must make this check, just in case there was an unexpected
+	     * EOF.
+	     */
+	    if (type_code != 0) {
+		/*
+		 * The only intended purpose of calling lexi() below is to
+		 * categorize the next token in order to decide whether to
+		 * continue buffering forthcoming tokens. Once the buffering
+		 * is over, lexi() will be called again elsewhere on all of
+		 * the tokens - this time for normal processing.
+		 *
+		 * Calling it for this purpose is a bug, because lexi() also
+		 * changes the parser state and discards leading whitespace,
+		 * which is needed mostly for comment-related considerations.
+		 *
+		 * Work around the former problem by giving lexi() a copy of
+		 * the current parser state and discard it if the call turned
+		 * out to be just a look ahead.
+		 *
+		 * Work around the latter problem by copying all whitespace
+		 * characters into the buffer so that the later lexi() call
+		 * will read them.
+		 */
+		if (sc_end != NULL) {
+		    while (*buf_ptr == ' ' || *buf_ptr == '\t') {
+			*sc_end++ = *buf_ptr++;
+			if (sc_end >= &save_com[sc_size]) {
+			    abort();
+			}
+		    }
+		    if (buf_ptr >= buf_end) {
+			fill_buffer();
+		    }
+		}
+		transient_state = ps;
+		type_code = lexi(&transient_state);	/* read another token */
+		if (type_code != newline && type_code != form_feed &&
+		    type_code != comment && !transient_state.search_brace) {
+		    ps = transient_state;
+		}
+	    }
 	}			/* end of while (search_brace) */
 	last_else = 0;
 check_type:
--- a/usr.bin/indent/indent.h
+++ b/usr.bin/indent/indent.h
@ -36,7 +36,7 @@ int	compute_code_target(void);
 int	compute_label_target(void);
 int	count_spaces(int, char *);
 int	count_spaces_until(int, char *, char *);
-int	lexi(void);
+int	lexi(struct parser_state *);
 void	diag2(int, const char *);
 void	diag3(int, const char *, int);
 void	diag4(int, const char *, int, int);
--- a/usr.bin/indent/lexi.c
+++ b/usr.bin/indent/lexi.c
@ -141,7 +141,7 @@ strcmp_type(const void *e1, const void *e2)
 }

 int
-lexi(void)
+lexi(struct parser_state *state)
 {
    int         unary_delim;	/* this is set to 1 if the current token
 				 * forces a following operator to be unary */
@ -152,12 +152,13 @@ lexi(void)

    e_token = s_token;		/* point to start of place to save token */
    unary_delim = false;
-    ps.col_1 = ps.last_nl;	/* tell world that this token started in
-				 * column 1 iff the last thing scanned was nl */
-    ps.last_nl = false;
+    state->col_1 = state->last_nl;	/* tell world that this token started
+					 * in column 1 iff the last thing
+					 * scanned was a newline */
+    state->last_nl = false;

    while (*buf_ptr == ' ' || *buf_ptr == '\t') {	/* get rid of blanks */
-	ps.col_1 = false;	/* leading blanks imply token is not in column
+	state->col_1 = false;	/* leading blanks imply token is not in column
 				 * 1 */
 	if (++buf_ptr >= buf_end)
 	    fill_buffer();
@ -281,18 +282,19 @@ lexi(void)
 	    if (++buf_ptr >= buf_end)
 		fill_buffer();
 	}
-	ps.keyword = 0;
-	if (l_struct && !ps.p_l_follow) {
+	state->keyword = 0;
+	if (l_struct && !state->p_l_follow) {
 				/* if last token was 'struct' and we're not
 				 * in parentheses, then this token
 				 * should be treated as a declaration */
 	    l_struct = false;
 	    last_code = ident;
-	    ps.last_u_d = true;
+	    state->last_u_d = true;
 	    return (decl);
 	}
-	ps.last_u_d = l_struct;	/* Operator after identifier is binary
-				 * unless last token was 'struct' */
+	state->last_u_d = l_struct;	/* Operator after identifier is
+					 * binary unless last token was
+					 * 'struct' */
 	l_struct = false;
 	last_code = ident;	/* Remember that this is the code we will
 				 * return */
@ -310,13 +312,13 @@ lexi(void)
 	        strcmp(u, "_t") == 0) || (typename_top >= 0 &&
 		  bsearch(s_token, typenames, typename_top + 1,
 		    sizeof(typenames[0]), strcmp_type))) {
-		ps.keyword = 4;	/* a type name */
-		ps.last_u_d = true;
+		state->keyword = 4;	/* a type name */
+		state->last_u_d = true;
 	        goto found_typename;
 	    }
 	} else {			/* we have a keyword */
-	    ps.keyword = p->rwcode;
-	    ps.last_u_d = true;
+	    state->keyword = p->rwcode;
+	    state->last_u_d = true;
 	    switch (p->rwcode) {
 	    case 7:		/* it is a switch */
 		return (swstmt);
@ -333,9 +335,9 @@ lexi(void)

 	    case 4:		/* one of the declaration keywords */
 	    found_typename:
-		if (ps.p_l_follow) {
+		if (state->p_l_follow) {
 		    /* inside parens: cast, param list, offsetof or sizeof */
-		    ps.cast_mask |= (1 << ps.p_l_follow) & ~ps.not_cast_mask;
+		    state->cast_mask |= (1 << state->p_l_follow) & ~state->not_cast_mask;
 		    break;
 		}
 		last_code = decl;
@ -358,15 +360,15 @@ lexi(void)
 		return (ident);
 	    }			/* end of switch */
 	}			/* end of if (found_it) */
-	if (*buf_ptr == '(' && ps.tos <= 1 && ps.ind_level == 0 &&
-	    ps.in_parameter_declaration == 0 && ps.block_init == 0) {
+	if (*buf_ptr == '(' && state->tos <= 1 && state->ind_level == 0 &&
+	    state->in_parameter_declaration == 0 && state->block_init == 0) {
 	    char *tp = buf_ptr;
 	    while (tp < buf_end)
 		if (*tp++ == ')' && (*tp == ';' || *tp == ','))
 		    goto not_proc;
-	    strncpy(ps.procname, token, sizeof ps.procname - 1);
-	    if (ps.in_decl)
-		ps.in_parameter_declaration = 1;
+	    strncpy(state->procname, token, sizeof state->procname - 1);
+	    if (state->in_decl)
+		state->in_parameter_declaration = 1;
 	    return (last_code = funcname);
    not_proc:;
 	}
@ -376,19 +378,19 @@ lexi(void)
 	 * typedefd
 	 */
 	if (((*buf_ptr == '*' && buf_ptr[1] != '=') || isalpha(*buf_ptr) || *buf_ptr == '_')
-		&& !ps.p_l_follow
-	        && !ps.block_init
-		&& (ps.last_token == rparen || ps.last_token == semicolon ||
-		    ps.last_token == decl ||
-		    ps.last_token == lbrace || ps.last_token == rbrace)) {
-	    ps.keyword = 4;	/* a type name */
-	    ps.last_u_d = true;
+		&& !state->p_l_follow
+	        && !state->block_init
+		&& (state->last_token == rparen || state->last_token == semicolon ||
+		    state->last_token == decl ||
+		    state->last_token == lbrace || state->last_token == rbrace)) {
+	    state->keyword = 4;	/* a type name */
+	    state->last_u_d = true;
 	    last_code = decl;
 	    return decl;
 	}
 	if (last_code == decl)	/* if this is a declared variable, then
 				 * following sign is unary */
-	    ps.last_u_d = true;	/* will make "int a -1" work */
+	    state->last_u_d = true;	/* will make "int a -1" work */
 	last_code = ident;
 	return (ident);		/* the ident is not in the list */
    }				/* end of procesing for alpanum character */
@ -403,8 +405,8 @@ lexi(void)

    switch (*token) {
    case '\n':
-	unary_delim = ps.last_u_d;
-	ps.last_nl = true;	/* remember that we just had a newline */
+	unary_delim = state->last_u_d;
+	state->last_nl = true;	/* remember that we just had a newline */
 	code = (had_eof ? 0 : newline);

 	/*
@ -473,7 +475,7 @@ lexi(void)
 	break;

    case '#':
-	unary_delim = ps.last_u_d;
+	unary_delim = state->last_u_d;
 	code = preesc;
 	break;

@ -496,21 +498,21 @@ lexi(void)
 	unary_delim = true;

 	/*
-	 * if (ps.in_or_st) ps.block_init = 1;
+	 * if (state->in_or_st) state->block_init = 1;
 	 */
-	/* ?	code = ps.block_init ? lparen : lbrace; */
+	/* ?	code = state->block_init ? lparen : lbrace; */
 	code = lbrace;
 	break;

    case ('}'):
 	unary_delim = true;
-	/* ?	code = ps.block_init ? rparen : rbrace; */
+	/* ?	code = state->block_init ? rparen : rbrace; */
 	code = rbrace;
 	break;

    case 014:			/* a form feed */
-	unary_delim = ps.last_u_d;
-	ps.last_nl = true;	/* remember this so we can set 'ps.col_1'
+	unary_delim = state->last_u_d;
+	state->last_nl = true;	/* remember this so we can set 'state->col_1'
 				 * right */
 	code = form_feed;
 	break;
@ -527,7 +529,7 @@ lexi(void)

    case '-':
    case '+':			/* check for -, +, --, ++ */
-	code = (ps.last_u_d ? unary_op : binary_op);
+	code = (state->last_u_d ? unary_op : binary_op);
 	unary_delim = true;

 	if (*buf_ptr == token[0]) {
@ -535,7 +537,7 @@ lexi(void)
 	    *e_token++ = *buf_ptr++;
 	    /* buffer overflow will be checked at end of loop */
 	    if (last_code == ident || last_code == rparen) {
-		code = (ps.last_u_d ? unary_op : postop);
+		code = (state->last_u_d ? unary_op : postop);
 		/* check for following ++ or -- */
 		unary_delim = false;
 	    }
@ -548,14 +550,14 @@ lexi(void)
 	    *e_token++ = *buf_ptr++;
 	    unary_delim = false;
 	    code = unary_op;
-	    ps.want_blank = false;
+	    state->want_blank = false;
 	}
 	break;			/* buffer overflow will be checked at end of
 				 * switch */

    case '=':
-	if (ps.in_or_st)
-	    ps.block_init = 1;
+	if (state->in_or_st)
+	    state->block_init = 1;
 #ifdef undef
 	if (chartype[*buf_ptr] == opchar) {	/* we have two char assignment */
 	    e_token[-1] = *buf_ptr++;
@ -586,7 +588,7 @@ lexi(void)
 	}
 	if (*buf_ptr == '=')
 	    *e_token++ = *buf_ptr++;
-	code = (ps.last_u_d ? unary_op : binary_op);
+	code = (state->last_u_d ? unary_op : binary_op);
 	unary_delim = true;
 	break;

@ -599,7 +601,7 @@ lexi(void)
 		fill_buffer();

 	    code = comment;
-	    unary_delim = ps.last_u_d;
+	    unary_delim = state->last_u_d;
 	    break;
 	}
 	while (*(e_token - 1) == *buf_ptr || *buf_ptr == '=') {
@ -610,7 +612,7 @@ lexi(void)
 	    if (++buf_ptr >= buf_end)
 		fill_buffer();
 	}
-	code = (ps.last_u_d ? unary_op : binary_op);
+	code = (state->last_u_d ? unary_op : binary_op);
 	unary_delim = true;


@ -621,7 +623,7 @@ lexi(void)
    }
    if (buf_ptr >= buf_end)	/* check for input buffer empty */
 	fill_buffer();
-    ps.last_u_d = unary_delim;
+    state->last_u_d = unary_delim;
    *e_token = '\0';		/* null terminate the token */
    return (code);
 }