Vendor import of bwk's 29-Jul-2003 release.

This commit is contained in:
ru 2003-07-30 06:47:03 +00:00
parent 153c5b8c9d
commit 6247f74063
5 changed files with 60 additions and 30 deletions

View File

@ -25,6 +25,52 @@ THIS SOFTWARE.
This file lists all bug fixes, changes, etc., made since the AWK book
was sent to the printers in August, 1987.
Jul 29, 2003:
fixed (i think) the long-standing botch that included the
beginning-of-line state ^ for REs in the set of valid characters; this led to a
variety of odd problems, including failure to properly match certain
regular expressions in non-US locales. thanks to ruslan for keeping
at this one.
Jul 28, 2003:
n-th try at getting internationalization right, with thanks to volker
kiefel, arnold robbins and ruslan ermilov for advice, though they
should not be blamed for the outcome. according to posix, "." is the
radix character in programs and command line arguments regardless of
the locale; otherwise, the locale should prevail for input and output
of numbers. so it's intended to work that way.
i have rescinded the attempt to use strcoll in expanding shorthands in
regular expressions (cclenter). its properties are much too
surprising; for example [a-c] matches aAbBc in locale en_US but abBcC
in locale fr_CA. i can see how this might arise by implementation
but i cannot explain it to a human user. (this behavior can be seen
in gawk as well; we're leaning on the same library.)
the issue appears to be that strcoll is meant for sorting, where
merging upper and lower case may make sense (though note that unix
sort does not do this by default either). it is not appropriate
for regular expressions, where the goal is to match specific
patterns of characters. in any case, the notations [:lower:], etc.,
are available in awk, and they are more likely to work correctly in
most locales.
a moratorium is hereby declared on internationalization changes.
i apologize to friends and colleagues in other parts of the world.
i would truly like to get this "right", but i don't know what
that is, and i do not want to keep making changes until it's clear.
Jul 4, 2003:
fixed bug that permitted non-terminated RE, as in "awk /x".
Jun 1, 2003:
subtle change to split: if source is empty, number of elems
is always 0 and the array is not set.
Mar 21, 2003:
added some parens to isblank, in another attempt to make things
internationally portable.
Mar 14, 2003:
the internationalization changes, somewhat modified, are now
reinstated. in theory awk will now do character comparisons

View File

@ -33,7 +33,7 @@ THIS SOFTWARE.
#include "awk.h"
#include "ytab.h"
#define HAT (NCHARS-2) /* matches ^ in regular expr */
#define HAT (NCHARS+2) /* matches ^ in regular expr */
/* NCHARS is 2**n */
#define MAXLIN 22
@ -282,24 +282,9 @@ int quoted(char **pp) /* pick up next thing after a \\ */
return c;
}
/*
 * compare two characters the way strcoll orders them.
 * each character is stored as its own one-character string and the
 * pair is handed to strcoll; when strcoll reports the two as equal,
 * the difference of their uschar values is used to break the tie.
 * returns 0 when a and b are the same after conversion to uschar,
 * otherwise a negative or positive value saying which one comes first.
 */
static int collate_range_cmp(int a, int b)
{
int r;
static char s[2][2];	/* static, so s[0][1] and s[1][1] remain '\0' terminators */
if ((uschar)a == (uschar)b)
return 0;
s[0][0] = a;
s[1][0] = b;
if ((r = strcoll(s[0], s[1])) == 0)
r = (uschar)a - (uschar)b;	/* collation tie: fall back to uschar order */
return r;
}
char *cclenter(const char *argp) /* add a character class */
{
int i, c, c2;
int j;
uschar *p = (uschar *) argp;
uschar *op, *bp;
static uschar *buf = 0;
@ -318,18 +303,15 @@ char *cclenter(const char *argp) /* add a character class */
c2 = *p++;
if (c2 == '\\')
c2 = quoted((char **) &p);
if (collate_range_cmp(c, c2) > 0) { /* empty; ignore */
if (c > c2) { /* empty; ignore */
bp--;
i--;
continue;
}
for (j = 0; j < NCHARS; j++) {
if ((collate_range_cmp(c, j) > 0) ||
collate_range_cmp(j, c2) > 0)
continue;
while (c < c2) {
if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
FATAL("out of space for character class [%.10s...] 2", p);
*bp++ = j;
*bp++ = ++c;
i++;
}
continue;
@ -718,11 +700,14 @@ Node *unary(Node *np)
* system i use, it's defined here. if some other locale has a richer
* definition of "blank", define HAS_ISBLANK and provide your own
* version.
* the parentheses here are an attempt to find a path through the maze
* of macro definition and/or function and/or version provided. thanks
* to nelson beebe for the suggestion; let's see if it works everywhere.
*/
#ifndef HAS_ISBLANK
int isblank(int c)
int (isblank)(int c)
{
return c==' ' || c=='\t';
}
@ -839,8 +824,6 @@ int cgoto(fa *f, int s, int c)
int i, j, k;
int *p, *q;
if (c < 0 || c > 255)
FATAL("can't happen: neg char %d in cgoto", c);
while (f->accept >= maxsetvec) { /* guessing here! */
maxsetvec *= 4;
setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));

View File

@ -529,6 +529,8 @@ int regexpr(void)
}
}
*bp = 0;
if (c == 0)
SYNTAX("non-terminated regular expression %.10s...", buf);
yylval.s = tostring(buf);
unput('/');
RET(REGEXPR);

View File

@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
****************************************************************/
const char *version = "version 20030314";
const char *version = "version 20030729";
#define DEBUG
#include <stdio.h>
@ -55,10 +55,8 @@ int main(int argc, char *argv[])
{
const char *fs = NULL;
setlocale(LC_ALL, "");
setlocale(LC_COLLATE, "");
setlocale(LC_CTYPE, "");
setlocale(LC_MESSAGES, "");
setlocale(LC_NUMERIC, "C"); /* for parsing cmdline & prog */
cmdname = argv[0];
if (argc == 1) {
fprintf(stderr, "Usage: %s [-f programfile | 'program'] [-Ffieldsep] [-v var=value] [files]\n", cmdname);
@ -147,6 +145,7 @@ int main(int argc, char *argv[])
if (!safe)
envinit(environ);
yyparse();
setlocale(LC_NUMERIC, ""); /* back to whatever it is locally */
if (fs)
*FS = qstring(fs, '\0');
dprintf( ("errorflag=%d\n", errorflag) );

View File

@ -1221,7 +1221,7 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */
ap->sval = (char *) makesymtab(NSYMTAB);
n = 0;
if ((*s != '\0' && strlen(fs) > 1) || arg3type == REGEXPR) { /* reg expr */
if (*s != '\0' && (strlen(fs) > 1 || arg3type == REGEXPR)) { /* reg expr */
fa *pfa;
if (arg3type == REGEXPR) { /* it's ready already */
pfa = (fa *) a[2];