Vendor import of bwk's 29-Jul-2003 release.
This commit is contained in:
parent
153c5b8c9d
commit
6247f74063
@ -25,6 +25,52 @@ THIS SOFTWARE.
|
||||
This file lists all bug fixes, changes, etc., made since the AWK book
|
||||
was sent to the printers in August, 1987.
|
||||
|
||||
Jul 29, 2003:
|
||||
fixed (i think) the long-standing botch that included the beginning of
|
||||
line state ^ for RE's in the set of valid characters; this led to a
|
||||
variety of odd problems, including failure to properly match certain
|
||||
regular expressions in non-US locales. thanks to ruslan for keeping
|
||||
at this one.
|
||||
|
||||
Jul 28, 2003:
|
||||
n-th try at getting internationalization right, with thanks to volker
|
||||
kiefel, arnold robbins and ruslan ermilov for advice, though they
|
||||
should not be blamed for the outcome. according to posix, "." is the
|
||||
radix character in programs and command line arguments regardless of
|
||||
the locale; otherwise, the locale should prevail for input and output
|
||||
of numbers. so it's intended to work that way.
|
||||
|
||||
i have rescinded the attempt to use strcoll in expanding shorthands in
|
||||
regular expressions (cclenter). its properties are much too
|
||||
surprising; for example [a-c] matches aAbBc in locale en_US but abBcC
|
||||
in locale fr_CA. i can see how this might arise by implementation
|
||||
but i cannot explain it to a human user. (this behavior can be seen
|
||||
in gawk as well; we're leaning on the same library.)
|
||||
|
||||
the issue appears to be that strcoll is meant for sorting, where
|
||||
merging upper and lower case may make sense (though note that unix
|
||||
sort does not do this by default either). it is not appropriate
|
||||
for regular expressions, where the goal is to match specific
|
||||
patterns of characters. in any case, the notations [:lower:], etc.,
|
||||
are available in awk, and they are more likely to work correctly in
|
||||
most locales.
|
||||
|
||||
a moratorium is hereby declared on internationalization changes.
|
||||
i apologize to friends and colleagues in other parts of the world.
|
||||
i would truly like to get this "right", but i don't know what
|
||||
that is, and i do not want to keep making changes until it's clear.
|
||||
|
||||
Jul 4, 2003:
|
||||
fixed bug that permitted non-terminated RE, as in "awk /x".
|
||||
|
||||
Jun 1, 2003:
|
||||
subtle change to split: if source is empty, number of elements
|
||||
is always 0 and the array is not set.
|
||||
|
||||
Mar 21, 2003:
|
||||
added some parens to isblank, in another attempt to make things
|
||||
internationally portable.
|
||||
|
||||
Mar 14, 2003:
|
||||
the internationalization changes, somewhat modified, are now
|
||||
reinstated. in theory awk will now do character comparisons
|
||||
|
@ -33,7 +33,7 @@ THIS SOFTWARE.
|
||||
#include "awk.h"
|
||||
#include "ytab.h"
|
||||
|
||||
#define HAT (NCHARS-2) /* matches ^ in regular expr */
|
||||
#define HAT (NCHARS+2) /* matches ^ in regular expr */
|
||||
/* NCHARS is 2**n */
|
||||
#define MAXLIN 22
|
||||
|
||||
@ -282,24 +282,9 @@ int quoted(char **pp) /* pick up next thing after a \\ */
|
||||
return c;
|
||||
}
|
||||
|
||||
static int collate_range_cmp(int a, int b)
|
||||
{
|
||||
int r;
|
||||
static char s[2][2];
|
||||
|
||||
if ((uschar)a == (uschar)b)
|
||||
return 0;
|
||||
s[0][0] = a;
|
||||
s[1][0] = b;
|
||||
if ((r = strcoll(s[0], s[1])) == 0)
|
||||
r = (uschar)a - (uschar)b;
|
||||
return r;
|
||||
}
|
||||
|
||||
char *cclenter(const char *argp) /* add a character class */
|
||||
{
|
||||
int i, c, c2;
|
||||
int j;
|
||||
uschar *p = (uschar *) argp;
|
||||
uschar *op, *bp;
|
||||
static uschar *buf = 0;
|
||||
@ -318,18 +303,15 @@ char *cclenter(const char *argp) /* add a character class */
|
||||
c2 = *p++;
|
||||
if (c2 == '\\')
|
||||
c2 = quoted((char **) &p);
|
||||
if (collate_range_cmp(c, c2) > 0) { /* empty; ignore */
|
||||
if (c > c2) { /* empty; ignore */
|
||||
bp--;
|
||||
i--;
|
||||
continue;
|
||||
}
|
||||
for (j = 0; j < NCHARS; j++) {
|
||||
if ((collate_range_cmp(c, j) > 0) ||
|
||||
collate_range_cmp(j, c2) > 0)
|
||||
continue;
|
||||
while (c < c2) {
|
||||
if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
|
||||
FATAL("out of space for character class [%.10s...] 2", p);
|
||||
*bp++ = j;
|
||||
*bp++ = ++c;
|
||||
i++;
|
||||
}
|
||||
continue;
|
||||
@ -718,11 +700,14 @@ Node *unary(Node *np)
|
||||
* system i use, it's defined here. if some other locale has a richer
|
||||
* definition of "blank", define HAS_ISBLANK and provide your own
|
||||
* version.
|
||||
* the parentheses here are an attempt to find a path through the maze
|
||||
* of macro definition and/or function and/or version provided. thanks
|
||||
* to nelson beebe for the suggestion; let's see if it works everywhere.
|
||||
*/
|
||||
|
||||
#ifndef HAS_ISBLANK
|
||||
|
||||
int isblank(int c)
|
||||
/*
 * Report whether c is a blank: returns 1 for a space or a tab, 0 for
 * anything else.  The name is parenthesized so that a function-like
 * macro called isblank, if one is in scope, is not expanded here.
 */
int (isblank)(int c)
{
	switch (c) {
	case ' ':
	case '\t':
		return 1;
	default:
		return 0;
	}
}
|
||||
@ -839,8 +824,6 @@ int cgoto(fa *f, int s, int c)
|
||||
int i, j, k;
|
||||
int *p, *q;
|
||||
|
||||
if (c < 0 || c > 255)
|
||||
FATAL("can't happen: neg char %d in cgoto", c);
|
||||
while (f->accept >= maxsetvec) { /* guessing here! */
|
||||
maxsetvec *= 4;
|
||||
setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
|
||||
|
@ -529,6 +529,8 @@ int regexpr(void)
|
||||
}
|
||||
}
|
||||
*bp = 0;
|
||||
if (c == 0)
|
||||
SYNTAX("non-terminated regular expression %.10s...", buf);
|
||||
yylval.s = tostring(buf);
|
||||
unput('/');
|
||||
RET(REGEXPR);
|
||||
|
@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
|
||||
THIS SOFTWARE.
|
||||
****************************************************************/
|
||||
|
||||
const char *version = "version 20030314";
|
||||
const char *version = "version 20030729";
|
||||
|
||||
#define DEBUG
|
||||
#include <stdio.h>
|
||||
@ -55,10 +55,8 @@ int main(int argc, char *argv[])
|
||||
{
|
||||
const char *fs = NULL;
|
||||
|
||||
setlocale(LC_ALL, "");
|
||||
setlocale(LC_COLLATE, "");
|
||||
setlocale(LC_CTYPE, "");
|
||||
setlocale(LC_MESSAGES, "");
|
||||
setlocale(LC_NUMERIC, "C"); /* for parsing cmdline & prog */
|
||||
cmdname = argv[0];
|
||||
if (argc == 1) {
|
||||
fprintf(stderr, "Usage: %s [-f programfile | 'program'] [-Ffieldsep] [-v var=value] [files]\n", cmdname);
|
||||
@ -147,6 +145,7 @@ int main(int argc, char *argv[])
|
||||
if (!safe)
|
||||
envinit(environ);
|
||||
yyparse();
|
||||
setlocale(LC_NUMERIC, ""); /* back to whatever it is locally */
|
||||
if (fs)
|
||||
*FS = qstring(fs, '\0');
|
||||
dprintf( ("errorflag=%d\n", errorflag) );
|
||||
|
@ -1221,7 +1221,7 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */
|
||||
ap->sval = (char *) makesymtab(NSYMTAB);
|
||||
|
||||
n = 0;
|
||||
if ((*s != '\0' && strlen(fs) > 1) || arg3type == REGEXPR) { /* reg expr */
|
||||
if (*s != '\0' && (strlen(fs) > 1 || arg3type == REGEXPR)) { /* reg expr */
|
||||
fa *pfa;
|
||||
if (arg3type == REGEXPR) { /* it's ready already */
|
||||
pfa = (fa *) a[2];
|
||||
|
Loading…
x
Reference in New Issue
Block a user