7551d83c35
Mainly focus on files that use BSD 2-Clause license, however the tool I was using misidentified many licenses so this was mostly a manual - error prone - task. The Software Package Data Exchange (SPDX) group provides a specification to make it easier for automated tools to detect and summarize well known opensource licenses. We are gradually adopting the specification, noting that the tags are considered only advisory and do not, in any way, superceed or replace the license texts. No functional change intended.
1071 lines
29 KiB
C
1071 lines
29 KiB
C
/* $FreeBSD$ */
|
|
|
|
/*-
|
|
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
|
|
*
|
|
* Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
|
|
* Copyright (C) 2008-2011 Gabor Kovesdan <gabor@FreeBSD.org>
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*/
|
|
|
|
#include "glue.h"
|
|
|
|
#include <ctype.h>
|
|
#include <limits.h>
|
|
#include <regex.h>
|
|
#include <stdbool.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#ifdef TRE_WCHAR
|
|
#include <wchar.h>
|
|
#include <wctype.h>
|
|
#endif
|
|
|
|
#include "hashtable.h"
|
|
#include "tre-fastmatch.h"
|
|
|
|
static int fastcmp(const fastmatch_t *fg, const void *data,
|
|
tre_str_type_t type);
|
|
|
|
/*
|
|
* Clean up if pattern compilation fails.
|
|
*/
|
|
#define FAIL_COMP(errcode) \
|
|
{ \
|
|
if (fg->pattern) \
|
|
free(fg->pattern); \
|
|
if (fg->wpattern) \
|
|
free(fg->wpattern); \
|
|
if (fg->qsBc_table) \
|
|
hashtable_free(fg->qsBc_table); \
|
|
fg = NULL; \
|
|
return errcode; \
|
|
}
|
|
|
|
/*
|
|
* Skips n characters in the input string and assigns the start
|
|
* address to startptr. Note: as per IEEE Std 1003.1-2008
|
|
* matching is based on bit pattern not character representations
|
|
* so we can handle MB strings as byte sequences just like
|
|
* SB strings.
|
|
*/
|
|
#define SKIP_CHARS(n) \
|
|
switch (type) \
|
|
{ \
|
|
case STR_WIDE: \
|
|
startptr = str_wide + n; \
|
|
break; \
|
|
default: \
|
|
startptr = str_byte + n; \
|
|
}
|
|
|
|
/*
|
|
* Converts the wide string pattern to SB/MB string and stores
|
|
* it in fg->pattern. Sets fg->len to the byte length of the
|
|
* converted string.
|
|
*/
|
|
#define STORE_MBS_PAT \
|
|
{ \
|
|
size_t siz; \
|
|
\
|
|
siz = wcstombs(NULL, fg->wpattern, 0); \
|
|
if (siz == (size_t)-1) \
|
|
return REG_BADPAT; \
|
|
fg->len = siz; \
|
|
fg->pattern = malloc(siz + 1); \
|
|
if (fg->pattern == NULL) \
|
|
return REG_ESPACE; \
|
|
wcstombs(fg->pattern, fg->wpattern, siz); \
|
|
fg->pattern[siz] = '\0'; \
|
|
} \
|
|
|
|
#define CONV_MBS_PAT(src, dest, destsz) \
|
|
{ \
|
|
destsz = wcstombs(NULL, src, 0); \
|
|
if (destsz == (size_t)-1) \
|
|
return REG_BADPAT; \
|
|
dest = malloc(destsz + 1); \
|
|
if (dest == NULL) \
|
|
return REG_ESPACE; \
|
|
wcstombs(dest, src, destsz); \
|
|
dest[destsz] = '\0'; \
|
|
} \
|
|
|
|
#define IS_OUT_OF_BOUNDS \
|
|
((!fg->reversed \
|
|
? ((type == STR_WIDE) ? ((j + fg->wlen) > len) \
|
|
: ((j + fg->len) > len)) \
|
|
: (j < 0)))
|
|
|
|
/*
|
|
* Checks whether the new position after shifting in the input string
|
|
* is out of the bounds and break out from the loop if so.
|
|
*/
|
|
#define CHECKBOUNDS \
|
|
if (IS_OUT_OF_BOUNDS) \
|
|
break; \
|
|
|
|
/*
|
|
* Shifts in the input string after a mismatch. The position of the
|
|
* mismatch is stored in the mismatch variable.
|
|
*/
|
|
#define SHIFT \
|
|
CHECKBOUNDS; \
|
|
\
|
|
{ \
|
|
int r = -1; \
|
|
unsigned int bc = 0, gs = 0, ts; \
|
|
\
|
|
switch (type) \
|
|
{ \
|
|
case STR_WIDE: \
|
|
if (!fg->hasdot) \
|
|
{ \
|
|
if (u != 0 && (unsigned)mismatch == fg->wlen - 1 - shift) \
|
|
mismatch -= u; \
|
|
v = fg->wlen - 1 - mismatch; \
|
|
r = hashtable_get(fg->qsBc_table, \
|
|
&str_wide[!fg->reversed ? (size_t)j + fg->wlen \
|
|
: (size_t)j - 1], &bc); \
|
|
gs = fg->bmGs[mismatch]; \
|
|
} \
|
|
bc = (r == HASH_OK) ? bc : fg->defBc; \
|
|
DPRINT(("tre_fast_match: mismatch on character" CHF ", " \
|
|
"BC %d, GS %d\n", \
|
|
((const tre_char_t *)startptr)[mismatch + 1], \
|
|
bc, gs)); \
|
|
break; \
|
|
default: \
|
|
if (!fg->hasdot) \
|
|
{ \
|
|
if (u != 0 && (unsigned)mismatch == fg->len - 1 - shift) \
|
|
mismatch -= u; \
|
|
v = fg->len - 1 - mismatch; \
|
|
gs = fg->sbmGs[mismatch]; \
|
|
} \
|
|
bc = fg->qsBc[((const unsigned char *)str_byte) \
|
|
[!fg->reversed ? (size_t)j + fg->len \
|
|
: (size_t)j - 1]]; \
|
|
DPRINT(("tre_fast_match: mismatch on character %c, " \
|
|
"BC %d, GS %d\n", \
|
|
((const unsigned char *)startptr)[mismatch + 1], \
|
|
bc, gs)); \
|
|
} \
|
|
if (fg->hasdot) \
|
|
shift = bc; \
|
|
else \
|
|
{ \
|
|
ts = (u >= v) ? (u - v) : 0; \
|
|
shift = MAX(ts, bc); \
|
|
shift = MAX(shift, gs); \
|
|
if (shift == gs) \
|
|
u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - shift, v); \
|
|
else \
|
|
{ \
|
|
if (ts < bc) \
|
|
shift = MAX(shift, u + 1); \
|
|
u = 0; \
|
|
} \
|
|
} \
|
|
DPRINT(("tre_fast_match: shifting %u characters\n", shift)); \
|
|
j = !fg->reversed ? j + shift : j - shift; \
|
|
}
|
|
|
|
/*
|
|
* Normal Quick Search would require a shift based on the position the
|
|
* next character after the comparison is within the pattern. With
|
|
* wildcards, the position of the last dot effects the maximum shift
|
|
* distance.
|
|
* The closer to the end the wild card is the slower the search.
|
|
*
|
|
* Examples:
|
|
* Pattern Max shift
|
|
* ------- ---------
|
|
* this 5
|
|
* .his 4
|
|
* t.is 3
|
|
* th.s 2
|
|
* thi. 1
|
|
*/
|
|
|
|
/*
|
|
* Fills in the bad character shift array for SB/MB strings.
|
|
*/
|
|
#define FILL_QSBC \
|
|
if (fg->reversed) \
|
|
{ \
|
|
_FILL_QSBC_REVERSED \
|
|
} \
|
|
else \
|
|
{ \
|
|
_FILL_QSBC \
|
|
}
|
|
|
|
#define _FILL_QSBC \
|
|
for (unsigned int i = 0; i <= UCHAR_MAX; i++) \
|
|
fg->qsBc[i] = fg->len - hasdot; \
|
|
for (unsigned int i = hasdot + 1; i < fg->len; i++) \
|
|
{ \
|
|
fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i; \
|
|
DPRINT(("BC shift for char %c is %zu\n", fg->pattern[i], \
|
|
fg->len - i)); \
|
|
if (fg->icase) \
|
|
{ \
|
|
char c = islower((unsigned char)fg->pattern[i]) ? \
|
|
toupper((unsigned char)fg->pattern[i]) : \
|
|
tolower((unsigned char)fg->pattern[i]); \
|
|
fg->qsBc[(unsigned char)c] = fg->len - i; \
|
|
DPRINT(("BC shift for char %c is %zu\n", c, fg->len - i)); \
|
|
} \
|
|
}
|
|
|
|
#define _FILL_QSBC_REVERSED \
|
|
for (unsigned int i = 0; i <= UCHAR_MAX; i++) \
|
|
fg->qsBc[i] = firstdot + 1; \
|
|
for (int i = firstdot - 1; i >= 0; i--) \
|
|
{ \
|
|
fg->qsBc[(unsigned char)fg->pattern[i]] = i + 1; \
|
|
DPRINT(("Reverse BC shift for char %c is %d\n", fg->pattern[i], \
|
|
i + 1)); \
|
|
if (fg->icase) \
|
|
{ \
|
|
char c = islower((unsigned char)fg->pattern[i]) ? \
|
|
toupper((unsigned char)fg->pattern[i]) : \
|
|
tolower((unsigned char)fg->pattern[i]); \
|
|
fg->qsBc[(unsigned char)c] = i + 1; \
|
|
DPRINT(("Reverse BC shift for char %c is %d\n", c, i + 1)); \
|
|
} \
|
|
}
|
|
|
|
/*
|
|
* Fills in the bad character shifts into a hastable for wide strings.
|
|
* With wide characters it is not possible any more to use a normal
|
|
* array because there are too many characters and we could not
|
|
* provide enough memory. Fortunately, we only have to store distinct
|
|
* values for so many characters as the number of distinct characters
|
|
* in the pattern, so we can store them in a hashtable and store a
|
|
* default shift value for the rest.
|
|
*/
|
|
#define FILL_QSBC_WIDE \
|
|
if (fg->reversed) \
|
|
{ \
|
|
_FILL_QSBC_WIDE_REVERSED \
|
|
} \
|
|
else \
|
|
{ \
|
|
_FILL_QSBC_WIDE \
|
|
}
|
|
|
|
#define _FILL_QSBC_WIDE \
|
|
/* Adjust the shift based on location of the last dot ('.'). */ \
|
|
fg->defBc = fg->wlen - whasdot; \
|
|
\
|
|
/* Preprocess pattern. */ \
|
|
fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4), \
|
|
sizeof(tre_char_t), sizeof(int)); \
|
|
if (!fg->qsBc_table) \
|
|
FAIL_COMP(REG_ESPACE); \
|
|
for (unsigned int i = whasdot + 1; i < fg->wlen; i++) \
|
|
{ \
|
|
int k = fg->wlen - i; \
|
|
int r; \
|
|
\
|
|
r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \
|
|
if ((r == HASH_FAIL) || (r == HASH_FULL)) \
|
|
FAIL_COMP(REG_ESPACE); \
|
|
DPRINT(("BC shift for wide char " CHF " is %d\n", fg->wpattern[i],\
|
|
k)); \
|
|
if (fg->icase) \
|
|
{ \
|
|
tre_char_t wc = iswlower(fg->wpattern[i]) ? \
|
|
towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \
|
|
r = hashtable_put(fg->qsBc_table, &wc, &k); \
|
|
if ((r == HASH_FAIL) || (r == HASH_FULL)) \
|
|
FAIL_COMP(REG_ESPACE); \
|
|
DPRINT(("BC shift for wide char " CHF " is %d\n", wc, k)); \
|
|
} \
|
|
}
|
|
|
|
#define _FILL_QSBC_WIDE_REVERSED \
|
|
/* Adjust the shift based on location of the last dot ('.'). */ \
|
|
fg->defBc = (size_t)wfirstdot; \
|
|
\
|
|
/* Preprocess pattern. */ \
|
|
fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4), \
|
|
sizeof(tre_char_t), sizeof(int)); \
|
|
if (!fg->qsBc_table) \
|
|
FAIL_COMP(REG_ESPACE); \
|
|
for (int i = wfirstdot - 1; i >= 0; i--) \
|
|
{ \
|
|
int k = i + 1; \
|
|
int r; \
|
|
\
|
|
r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \
|
|
if ((r == HASH_FAIL) || (r == HASH_FULL)) \
|
|
FAIL_COMP(REG_ESPACE); \
|
|
DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", \
|
|
fg->wpattern[i], k)); \
|
|
if (fg->icase) \
|
|
{ \
|
|
tre_char_t wc = iswlower(fg->wpattern[i]) ? \
|
|
towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \
|
|
r = hashtable_put(fg->qsBc_table, &wc, &k); \
|
|
if ((r == HASH_FAIL) || (r == HASH_FULL)) \
|
|
FAIL_COMP(REG_ESPACE); \
|
|
DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", wc, \
|
|
k)); \
|
|
} \
|
|
}
|
|
|
|
#ifdef _GREP_DEBUG
|
|
#define DPRINT_BMGS(len, fmt_str, sh) \
|
|
for (unsigned int i = 0; i < len; i++) \
|
|
DPRINT((fmt_str, i, sh[i]));
|
|
#else
|
|
#define DPRINT_BMGS(len, fmt_str, sh) \
|
|
do { } while(/*CONSTCOND*/0)
|
|
#endif
|
|
|
|
/*
|
|
* Fills in the good suffix table for SB/MB strings.
|
|
*/
|
|
#define FILL_BMGS \
|
|
if (fg->len > 0 && !fg->hasdot) \
|
|
{ \
|
|
fg->sbmGs = malloc(fg->len * sizeof(*fg->sbmGs)); \
|
|
if (!fg->sbmGs) \
|
|
return REG_ESPACE; \
|
|
if (fg->len == 1) \
|
|
fg->sbmGs[0] = 1; \
|
|
else \
|
|
_FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false); \
|
|
DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs); \
|
|
}
|
|
|
|
/*
|
|
* Fills in the good suffix table for wide strings.
|
|
*/
|
|
#define FILL_BMGS_WIDE \
|
|
if (fg->wlen > 0 && !fg->hasdot) \
|
|
{ \
|
|
fg->bmGs = malloc(fg->wlen * sizeof(*fg->bmGs)); \
|
|
if (!fg->bmGs) \
|
|
return REG_ESPACE; \
|
|
if (fg->wlen == 1) \
|
|
fg->bmGs[0] = 1; \
|
|
else \
|
|
_FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true); \
|
|
DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n", \
|
|
fg->bmGs); \
|
|
}
|
|
|
|
#define _FILL_BMGS(arr, pat, plen, wide) \
|
|
{ \
|
|
char *p; \
|
|
tre_char_t *wp; \
|
|
\
|
|
if (wide) \
|
|
{ \
|
|
if (fg->icase) \
|
|
{ \
|
|
wp = malloc(plen * sizeof(tre_char_t)); \
|
|
if (wp == NULL) \
|
|
return REG_ESPACE; \
|
|
for (unsigned int i = 0; i < plen; i++) \
|
|
wp[i] = towlower(pat[i]); \
|
|
_CALC_BMGS(arr, wp, plen); \
|
|
free(wp); \
|
|
} \
|
|
else \
|
|
_CALC_BMGS(arr, pat, plen); \
|
|
} \
|
|
else \
|
|
{ \
|
|
if (fg->icase) \
|
|
{ \
|
|
p = malloc(plen); \
|
|
if (p == NULL) \
|
|
return REG_ESPACE; \
|
|
for (unsigned int i = 0; i < plen; i++) \
|
|
p[i] = tolower((unsigned char)pat[i]); \
|
|
_CALC_BMGS(arr, p, plen); \
|
|
free(p); \
|
|
} \
|
|
else \
|
|
_CALC_BMGS(arr, pat, plen); \
|
|
} \
|
|
}
|
|
|
|
#define _CALC_BMGS(arr, pat, plen) \
|
|
{ \
|
|
int f = 0, g; \
|
|
\
|
|
int *suff = malloc(plen * sizeof(int)); \
|
|
if (suff == NULL) \
|
|
return REG_ESPACE; \
|
|
\
|
|
suff[plen - 1] = plen; \
|
|
g = plen - 1; \
|
|
for (int i = plen - 2; i >= 0; i--) \
|
|
{ \
|
|
if (i > g && suff[i + plen - 1 - f] < i - g) \
|
|
suff[i] = suff[i + plen - 1 - f]; \
|
|
else \
|
|
{ \
|
|
if (i < g) \
|
|
g = i; \
|
|
f = i; \
|
|
while (g >= 0 && pat[g] == pat[g + plen - 1 - f]) \
|
|
g--; \
|
|
suff[i] = f - g; \
|
|
} \
|
|
} \
|
|
\
|
|
for (unsigned int i = 0; i < plen; i++) \
|
|
arr[i] = plen; \
|
|
g = 0; \
|
|
for (int i = plen - 1; i >= 0; i--) \
|
|
if (suff[i] == i + 1) \
|
|
for(; (unsigned long)g < plen - 1 - i; g++) \
|
|
if (arr[g] == plen) \
|
|
arr[g] = plen - 1 - i; \
|
|
for (unsigned int i = 0; i <= plen - 2; i++) \
|
|
arr[plen - 1 - suff[i]] = plen - 1 - i; \
|
|
\
|
|
free(suff); \
|
|
}
|
|
|
|
/*
|
|
* Copies the pattern pat having length n to p and stores
|
|
* the size in l.
|
|
*/
|
|
#define SAVE_PATTERN(src, srclen, dst, dstlen) \
|
|
dstlen = srclen; \
|
|
dst = malloc((dstlen + 1) * sizeof(tre_char_t)); \
|
|
if (dst == NULL) \
|
|
return REG_ESPACE; \
|
|
if (dstlen > 0) \
|
|
memcpy(dst, src, dstlen * sizeof(tre_char_t)); \
|
|
dst[dstlen] = TRE_CHAR('\0');
|
|
|
|
/*
|
|
* Initializes pattern compiling.
|
|
*/
|
|
#define INIT_COMP \
|
|
/* Initialize. */ \
|
|
memset(fg, 0, sizeof(*fg)); \
|
|
fg->icase = (cflags & REG_ICASE); \
|
|
fg->word = (cflags & REG_WORD); \
|
|
fg->newline = (cflags & REG_NEWLINE); \
|
|
fg->nosub = (cflags & REG_NOSUB); \
|
|
\
|
|
/* Cannot handle REG_ICASE with MB string */ \
|
|
if (fg->icase && (TRE_MB_CUR_MAX > 1) && n > 0) \
|
|
{ \
|
|
DPRINT(("Cannot use fast matcher for MBS with REG_ICASE\n")); \
|
|
return REG_BADPAT; \
|
|
}
|
|
|
|
/*
|
|
* Checks whether we have a 0-length pattern that will match
|
|
* anything. If literal is set to false, the EOL anchor is also
|
|
* taken into account.
|
|
*/
|
|
#define CHECK_MATCHALL(literal) \
|
|
if (!literal && n == 1 && pat[0] == TRE_CHAR('$')) \
|
|
{ \
|
|
n--; \
|
|
fg->eol = true; \
|
|
} \
|
|
\
|
|
if (n == 0) \
|
|
{ \
|
|
fg->matchall = true; \
|
|
fg->pattern = malloc(sizeof(char)); \
|
|
if (!fg->pattern) \
|
|
FAIL_COMP(REG_ESPACE); \
|
|
fg->pattern[0] = '\0'; \
|
|
fg->wpattern = malloc(sizeof(tre_char_t)); \
|
|
if (!fg->wpattern) \
|
|
FAIL_COMP(REG_ESPACE); \
|
|
fg->wpattern[0] = TRE_CHAR('\0'); \
|
|
DPRINT(("Matching every input\n")); \
|
|
return REG_OK; \
|
|
}
|
|
|
|
/*
|
|
* Returns: REG_OK on success, error code otherwise
|
|
*/
|
|
int
|
|
tre_compile_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n,
|
|
int cflags)
|
|
{
|
|
size_t hasdot = 0, whasdot = 0;
|
|
ssize_t firstdot = -1, wfirstdot = -1;
|
|
|
|
INIT_COMP;
|
|
|
|
CHECK_MATCHALL(true);
|
|
|
|
/* Cannot handle word boundaries with MB string */
|
|
if (fg->word && (TRE_MB_CUR_MAX > 1))
|
|
return REG_BADPAT;
|
|
|
|
#ifdef TRE_WCHAR
|
|
SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen);
|
|
STORE_MBS_PAT;
|
|
#else
|
|
SAVE_PATTERN(pat, n, fg->pattern, fg->len);
|
|
#endif
|
|
|
|
DPRINT(("tre_compile_literal: pattern: %s, len %zu, icase: %c, word: %c, "
|
|
"newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n',
|
|
fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n'));
|
|
|
|
FILL_QSBC;
|
|
FILL_BMGS;
|
|
#ifdef TRE_WCHAR
|
|
FILL_QSBC_WIDE;
|
|
FILL_BMGS_WIDE;
|
|
#endif
|
|
|
|
return REG_OK;
|
|
}
|
|
|
|
/*
|
|
* Returns: REG_OK on success, error code otherwise
|
|
*/
|
|
int
|
|
tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n,
|
|
int cflags)
|
|
{
|
|
tre_char_t *tmp;
|
|
size_t pos = 0, hasdot = 0, whasdot = 0;
|
|
ssize_t firstdot = -1, wfirstdot = -1;
|
|
bool escaped = false;
|
|
bool *_escmap = NULL;
|
|
|
|
INIT_COMP;
|
|
|
|
/* Remove beginning-of-line character ('^'). */
|
|
if (pat[0] == TRE_CHAR('^'))
|
|
{
|
|
fg->bol = true;
|
|
n--;
|
|
pat++;
|
|
}
|
|
|
|
CHECK_MATCHALL(false);
|
|
|
|
/* Handle word-boundary matching when GNU extensions are enabled */
|
|
if ((cflags & REG_GNU) && (n >= 14) &&
|
|
(memcmp(pat, TRE_CHAR("[[:<:]]"), 7 * sizeof(tre_char_t)) == 0) &&
|
|
(memcmp(pat + n - 7, TRE_CHAR("[[:>:]]"),
|
|
7 * sizeof(tre_char_t)) == 0))
|
|
{
|
|
n -= 14;
|
|
pat += 7;
|
|
fg->word = true;
|
|
}
|
|
|
|
/* Cannot handle word boundaries with MB string */
|
|
if (fg->word && (TRE_MB_CUR_MAX > 1))
|
|
return REG_BADPAT;
|
|
|
|
tmp = malloc((n + 1) * sizeof(tre_char_t));
|
|
if (tmp == NULL)
|
|
return REG_ESPACE;
|
|
|
|
/* Copies the char into the stored pattern and skips to the next char. */
|
|
#define STORE_CHAR \
|
|
do \
|
|
{ \
|
|
tmp[pos++] = pat[i]; \
|
|
escaped = false; \
|
|
continue; \
|
|
} while (0)
|
|
|
|
/* Traverse the input pattern for processing */
|
|
for (unsigned int i = 0; i < n; i++)
|
|
{
|
|
switch (pat[i])
|
|
{
|
|
case TRE_CHAR('\\'):
|
|
if (escaped)
|
|
STORE_CHAR;
|
|
else if (i == n - 1)
|
|
goto badpat;
|
|
else
|
|
escaped = true;
|
|
continue;
|
|
case TRE_CHAR('['):
|
|
if (escaped)
|
|
STORE_CHAR;
|
|
else
|
|
goto badpat;
|
|
continue;
|
|
case TRE_CHAR('*'):
|
|
if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
|
|
STORE_CHAR;
|
|
else
|
|
goto badpat;
|
|
continue;
|
|
case TRE_CHAR('+'):
|
|
case TRE_CHAR('?'):
|
|
if ((cflags & REG_EXTENDED) && (i == 0))
|
|
goto badpat;
|
|
else if ((cflags & REG_EXTENDED) ^ !escaped)
|
|
STORE_CHAR;
|
|
else
|
|
goto badpat;
|
|
continue;
|
|
case TRE_CHAR('.'):
|
|
if (escaped)
|
|
{
|
|
if (!_escmap)
|
|
_escmap = calloc(n, sizeof(bool));
|
|
if (!_escmap)
|
|
{
|
|
free(tmp);
|
|
return REG_ESPACE;
|
|
}
|
|
_escmap[i] = true;
|
|
STORE_CHAR;
|
|
}
|
|
else
|
|
{
|
|
whasdot = i;
|
|
if (wfirstdot == -1)
|
|
wfirstdot = i;
|
|
STORE_CHAR;
|
|
}
|
|
continue;
|
|
case TRE_CHAR('^'):
|
|
STORE_CHAR;
|
|
continue;
|
|
case TRE_CHAR('$'):
|
|
if (!escaped && (i == n - 1))
|
|
fg->eol = true;
|
|
else
|
|
STORE_CHAR;
|
|
continue;
|
|
case TRE_CHAR('('):
|
|
if ((cflags & REG_EXTENDED) ^ escaped)
|
|
goto badpat;
|
|
else
|
|
STORE_CHAR;
|
|
continue;
|
|
case TRE_CHAR('{'):
|
|
if (!(cflags & REG_EXTENDED) ^ escaped)
|
|
STORE_CHAR;
|
|
else if (!(cflags & REG_EXTENDED) && (i == 0))
|
|
STORE_CHAR;
|
|
else if ((cflags & REG_EXTENDED) && (i == 0))
|
|
continue;
|
|
else
|
|
goto badpat;
|
|
continue;
|
|
case TRE_CHAR('|'):
|
|
if ((cflags & REG_EXTENDED) ^ escaped)
|
|
goto badpat;
|
|
else
|
|
STORE_CHAR;
|
|
continue;
|
|
default:
|
|
if (escaped)
|
|
goto badpat;
|
|
else
|
|
STORE_CHAR;
|
|
continue;
|
|
}
|
|
continue;
|
|
badpat:
|
|
free(tmp);
|
|
DPRINT(("tre_compile_fast: compilation of pattern failed, falling"
|
|
"back to NFA\n"));
|
|
return REG_BADPAT;
|
|
}
|
|
|
|
fg->hasdot = wfirstdot > -1;
|
|
|
|
/*
|
|
* The pattern has been processed and copied to tmp as a literal string
|
|
* with escapes, anchors (^$) and the word boundary match character
|
|
* classes stripped out.
|
|
*/
|
|
#ifdef TRE_WCHAR
|
|
SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen);
|
|
fg->wescmap = _escmap;
|
|
STORE_MBS_PAT;
|
|
|
|
/*
|
|
* The position of dots and escaped dots is different in the MB string
|
|
* than in to the wide string so traverse the converted string, as well,
|
|
* to store these positions.
|
|
*/
|
|
if (fg->hasdot || (fg->wescmap != NULL))
|
|
{
|
|
if (fg->wescmap != NULL)
|
|
{
|
|
fg->escmap = calloc(fg->len, sizeof(bool));
|
|
if (fg->escmap == NULL)
|
|
{
|
|
tre_free_fast(fg);
|
|
return REG_ESPACE;
|
|
}
|
|
}
|
|
|
|
escaped = false;
|
|
char *_checkpat = NULL;
|
|
size_t _checklen = 0;
|
|
unsigned int escofs = 0;
|
|
/*
|
|
* Make a copy here of the original pattern, because fg->pattern has
|
|
* already been stripped of all escape sequences in the above processing.
|
|
* This is necessary if we wish to later treat fg->escmap as an actual,
|
|
* functional replacement of fg->wescmap.
|
|
*/
|
|
CONV_MBS_PAT(pat, _checkpat, _checklen);
|
|
for (unsigned int i = 0; i < n; i++)
|
|
if (_checkpat[i] == '\\')
|
|
{
|
|
escaped = !escaped;
|
|
if (escaped)
|
|
++escofs;
|
|
}
|
|
else if (_checkpat[i] == '.' && fg->escmap != NULL && escaped)
|
|
{
|
|
fg->escmap[i - escofs] = true;
|
|
escaped = false;
|
|
}
|
|
else if (_checkpat[i] == '.' && !escaped)
|
|
{
|
|
hasdot = i;
|
|
if (firstdot == -1)
|
|
firstdot = i;
|
|
}
|
|
else
|
|
escaped = false;
|
|
free(_checkpat);
|
|
}
|
|
#else
|
|
SAVE_PATTERN(tmp, pos, fg->pattern, fg->len);
|
|
fg->escmap = _escmap;
|
|
#endif
|
|
|
|
free(tmp);
|
|
|
|
DPRINT(("tre_compile_fast: pattern: %s, len %zu, bol %c, eol %c, "
|
|
"icase: %c, word: %c, newline %c\n", fg->pattern, fg->len,
|
|
fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n',
|
|
fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n',
|
|
fg->newline ? 'y' : 'n'));
|
|
|
|
/* Check whether reverse QS algorithm is more efficient */
|
|
if ((wfirstdot > -1) && (fg->wlen - whasdot + 1 < (size_t)wfirstdot) &&
|
|
fg->nosub)
|
|
{
|
|
fg->reversed = true;
|
|
DPRINT(("tre_compile_fast: using reverse QS algorithm\n"));
|
|
}
|
|
|
|
FILL_QSBC;
|
|
FILL_BMGS;
|
|
#ifdef TRE_WCHAR
|
|
FILL_QSBC_WIDE;
|
|
FILL_BMGS_WIDE;
|
|
#endif
|
|
|
|
return REG_OK;
|
|
}
|
|
|
|
#define _SHIFT_ONE \
|
|
{ \
|
|
shift = 1; \
|
|
j = !fg->reversed ? j + shift : j - shift; \
|
|
continue; \
|
|
}
|
|
|
|
#define _BBOUND_COND \
|
|
((type == STR_WIDE) ? \
|
|
((j == 0) || !(tre_isalnum(str_wide[j - 1]) || \
|
|
(str_wide[j - 1] == TRE_CHAR('_')))) : \
|
|
((j == 0) || !(tre_isalnum(str_byte[j - 1]) || \
|
|
(str_byte[j - 1] == '_'))))
|
|
|
|
#define _EBOUND_COND \
|
|
((type == STR_WIDE) ? \
|
|
((j + fg->wlen == len) || !(tre_isalnum(str_wide[j + fg->wlen]) || \
|
|
(str_wide[j + fg->wlen] == TRE_CHAR('_')))) : \
|
|
((j + fg->len == len) || !(tre_isalnum(str_byte[j + fg->len]) || \
|
|
(str_byte[j + fg->len] == '_'))))
|
|
|
|
/*
|
|
* Condition to check whether the match on position j is on a
|
|
* word boundary.
|
|
*/
|
|
#define IS_ON_WORD_BOUNDARY \
|
|
(_BBOUND_COND && _EBOUND_COND)
|
|
|
|
/*
|
|
* Checks word boundary and shifts one if match is not on a
|
|
* boundary.
|
|
*/
|
|
#define CHECK_WORD_BOUNDARY \
|
|
if (!IS_ON_WORD_BOUNDARY) \
|
|
_SHIFT_ONE;
|
|
|
|
#define _BOL_COND \
|
|
((j == 0) || ((type == STR_WIDE) ? (str_wide[j - 1] == TRE_CHAR('\n'))\
|
|
: (str_byte[j - 1] == '\n')))
|
|
|
|
/*
|
|
* Checks BOL anchor and shifts one if match is not on a
|
|
* boundary.
|
|
*/
|
|
#define CHECK_BOL_ANCHOR \
|
|
if (!_BOL_COND) \
|
|
_SHIFT_ONE;
|
|
|
|
#define _EOL_COND \
|
|
((type == STR_WIDE) \
|
|
? ((j + fg->wlen == len) || \
|
|
(str_wide[j + fg->wlen] == TRE_CHAR('\n'))) \
|
|
: ((j + fg->len == len) || (str_byte[j + fg->wlen] == '\n')))
|
|
|
|
/*
|
|
* Checks EOL anchor and shifts one if match is not on a
|
|
* boundary.
|
|
*/
|
|
#define CHECK_EOL_ANCHOR \
|
|
if (!_EOL_COND) \
|
|
_SHIFT_ONE;
|
|
|
|
/*
|
|
* Executes matching of the precompiled pattern on the input string.
|
|
* Returns REG_OK or REG_NOMATCH depending on if we find a match or not.
|
|
*/
|
|
int
|
|
tre_match_fast(const fastmatch_t *fg, const void *data, size_t len,
|
|
tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags)
|
|
{
|
|
unsigned int shift, u = 0, v = 0;
|
|
ssize_t j = 0;
|
|
int ret = REG_NOMATCH;
|
|
int mismatch;
|
|
const char *str_byte = data;
|
|
const void *startptr = NULL;
|
|
const tre_char_t *str_wide = data;
|
|
|
|
/* Calculate length if unspecified. */
|
|
if (len == (size_t)-1)
|
|
switch (type)
|
|
{
|
|
case STR_WIDE:
|
|
len = tre_strlen(str_wide);
|
|
break;
|
|
default:
|
|
len = strlen(str_byte);
|
|
break;
|
|
}
|
|
|
|
/* Shortcut for empty pattern */
|
|
if (fg->matchall)
|
|
{
|
|
if (!fg->nosub && nmatch >= 1)
|
|
{
|
|
pmatch[0].rm_so = 0;
|
|
pmatch[0].rm_eo = len;
|
|
}
|
|
if (fg->bol && fg->eol)
|
|
return (len == 0) ? REG_OK : REG_NOMATCH;
|
|
else
|
|
return REG_OK;
|
|
}
|
|
|
|
/* No point in going farther if we do not have enough data. */
|
|
switch (type)
|
|
{
|
|
case STR_WIDE:
|
|
if (len < fg->wlen)
|
|
return ret;
|
|
shift = fg->wlen;
|
|
break;
|
|
default:
|
|
if (len < fg->len)
|
|
return ret;
|
|
shift = fg->len;
|
|
}
|
|
|
|
/*
|
|
* REG_NOTBOL means not anchoring ^ to the beginning of the line, so we
|
|
* can shift one because there can't be a match at the beginning.
|
|
*/
|
|
if (fg->bol && (eflags & REG_NOTBOL))
|
|
j = 1;
|
|
|
|
/*
|
|
* Like above, we cannot have a match at the very end when anchoring to
|
|
* the end and REG_NOTEOL is specified.
|
|
*/
|
|
if (fg->eol && (eflags & REG_NOTEOL))
|
|
len--;
|
|
|
|
if (fg->reversed)
|
|
j = len - (type == STR_WIDE ? fg->wlen : fg->len);
|
|
|
|
|
|
/* Only try once at the beginning or ending of the line. */
|
|
if ((fg->bol || fg->eol) && !fg->newline && !(eflags & REG_NOTBOL) &&
|
|
!(eflags & REG_NOTEOL))
|
|
{
|
|
/* Simple text comparison. */
|
|
if (!((fg->bol && fg->eol) &&
|
|
(type == STR_WIDE ? (len != fg->wlen) : (len != fg->len))))
|
|
{
|
|
/* Determine where in data to start search at. */
|
|
j = fg->eol ? len - (type == STR_WIDE ? fg->wlen : fg->len) : 0;
|
|
SKIP_CHARS(j);
|
|
mismatch = fastcmp(fg, startptr, type);
|
|
if (mismatch == REG_OK)
|
|
{
|
|
if (fg->word && !IS_ON_WORD_BOUNDARY)
|
|
return ret;
|
|
if (!fg->nosub && nmatch >= 1)
|
|
{
|
|
pmatch[0].rm_so = j;
|
|
pmatch[0].rm_eo = j + (type == STR_WIDE ? fg->wlen : fg->len);
|
|
}
|
|
return REG_OK;
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
/* Quick Search / Turbo Boyer-Moore algorithm. */
|
|
do
|
|
{
|
|
SKIP_CHARS(j);
|
|
mismatch = fastcmp(fg, startptr, type);
|
|
if (mismatch == REG_OK)
|
|
{
|
|
if (fg->word)
|
|
CHECK_WORD_BOUNDARY;
|
|
if (fg->bol)
|
|
CHECK_BOL_ANCHOR;
|
|
if (fg->eol)
|
|
CHECK_EOL_ANCHOR;
|
|
if (!fg->nosub && nmatch >= 1)
|
|
{
|
|
pmatch[0].rm_so = j;
|
|
pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len);
|
|
}
|
|
return REG_OK;
|
|
}
|
|
else if (mismatch > 0)
|
|
return mismatch;
|
|
mismatch = -mismatch - 1;
|
|
SHIFT;
|
|
} while (!IS_OUT_OF_BOUNDS);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
/*
|
|
* Frees the resources that were allocated when the pattern was compiled.
|
|
*/
|
|
void
|
|
tre_free_fast(fastmatch_t *fg)
|
|
{
|
|
|
|
DPRINT(("tre_fast_free: freeing structures for pattern %s\n",
|
|
fg->pattern));
|
|
|
|
#ifdef TRE_WCHAR
|
|
hashtable_free(fg->qsBc_table);
|
|
if (!fg->hasdot)
|
|
free(fg->bmGs);
|
|
if (fg->wescmap)
|
|
free(fg->wescmap);
|
|
free(fg->wpattern);
|
|
#endif
|
|
if (!fg->hasdot)
|
|
free(fg->sbmGs);
|
|
if (fg->escmap)
|
|
free(fg->escmap);
|
|
free(fg->pattern);
|
|
}
|
|
|
|
/*
|
|
* Returns: -(i + 1) on failure (position that it failed with minus sign)
|
|
* error code on error
|
|
* REG_OK on success
|
|
*/
|
|
static inline int
|
|
fastcmp(const fastmatch_t *fg, const void *data, tre_str_type_t type)
|
|
{
|
|
const char *str_byte = data;
|
|
const char *pat_byte = fg->pattern;
|
|
const tre_char_t *str_wide = data;
|
|
const tre_char_t *pat_wide = fg->wpattern;
|
|
const bool *escmap = (type == STR_WIDE) ? fg->wescmap : fg->escmap;
|
|
size_t len = (type == STR_WIDE) ? fg->wlen : fg->len;
|
|
int ret = REG_OK;
|
|
|
|
/* Compare the pattern and the input char-by-char from the last position. */
|
|
for (int i = len - 1; i >= 0; i--) {
|
|
switch (type)
|
|
{
|
|
case STR_WIDE:
|
|
|
|
/* Check dot */
|
|
if (fg->hasdot && pat_wide[i] == TRE_CHAR('.') &&
|
|
(!escmap || !escmap[i]) &&
|
|
(!fg->newline || (str_wide[i] != TRE_CHAR('\n'))))
|
|
continue;
|
|
|
|
/* Compare */
|
|
if (fg->icase ? (towlower(pat_wide[i]) == towlower(str_wide[i]))
|
|
: (pat_wide[i] == str_wide[i]))
|
|
continue;
|
|
break;
|
|
default:
|
|
/* Check dot */
|
|
if (fg->hasdot && pat_byte[i] == '.' &&
|
|
(!escmap || !escmap[i]) &&
|
|
(!fg->newline || (str_byte[i] != '\n')))
|
|
continue;
|
|
|
|
/* Compare */
|
|
if (fg->icase ? (tolower((unsigned char)pat_byte[i]) == tolower((unsigned char)str_byte[i]))
|
|
: (pat_byte[i] == str_byte[i]))
|
|
continue;
|
|
}
|
|
DPRINT(("fastcmp: mismatch at position %d\n", i));
|
|
ret = -(i + 1);
|
|
break;
|
|
}
|
|
return ret;
|
|
}
|