From a2584d1b34c54485330d2bba1983b708e52dc6d4 Mon Sep 17 00:00:00 2001 From: Kyle Evans Date: Fri, 4 May 2018 03:13:25 +0000 Subject: [PATCH] bsdgrep: annihilate our in-tree TRE, previously disabled by default It was an old TRE that had plenty of bugs and no performance gain over regex(3). I disabled it by default in r323615, and there was some confusion about what the knob does- likely due to poor naming on my part- to the tune of "well, it sounds like it should speed things up" (mentioned by multiple people). To compound this, I have no intention of maintaining a second regex implementation. If someone would like to step up and volunteer to maintain a lean-and-mean implementation for grep, this is OK, but we have very few volunteers to maintain even our primary regex implementation. --- share/mk/bsd.prog.mk | 1 + share/mk/src.opts.mk | 1 - .../build/options/WITHOUT_BSD_GREP_FASTMATCH | 5 - tools/build/options/WITH_BSD_GREP_FASTMATCH | 3 - usr.bin/grep/Makefile | 9 - usr.bin/grep/grep.c | 18 - usr.bin/grep/grep.h | 7 - usr.bin/grep/regex/fastmatch.c | 170 --- usr.bin/grep/regex/fastmatch.h | 95 -- usr.bin/grep/regex/glue.h | 67 -- usr.bin/grep/regex/hashtable.c | 270 ----- usr.bin/grep/regex/hashtable.h | 35 - usr.bin/grep/regex/tre-compile.c | 101 -- usr.bin/grep/regex/tre-fastmatch.c | 1070 ----------------- usr.bin/grep/regex/tre-fastmatch.h | 21 - usr.bin/grep/util.c | 17 +- 16 files changed, 3 insertions(+), 1887 deletions(-) delete mode 100644 tools/build/options/WITHOUT_BSD_GREP_FASTMATCH delete mode 100644 tools/build/options/WITH_BSD_GREP_FASTMATCH delete mode 100644 usr.bin/grep/regex/fastmatch.c delete mode 100644 usr.bin/grep/regex/fastmatch.h delete mode 100644 usr.bin/grep/regex/glue.h delete mode 100644 usr.bin/grep/regex/hashtable.c delete mode 100644 usr.bin/grep/regex/hashtable.h delete mode 100644 usr.bin/grep/regex/tre-compile.c delete mode 100644 usr.bin/grep/regex/tre-fastmatch.c delete mode 100644 usr.bin/grep/regex/tre-fastmatch.h diff --git a/share/mk/bsd.prog.mk b/share/mk/bsd.prog.mk index c88742aff1e7..c4ca61677528 100644 --- a/share/mk/bsd.prog.mk +++ b/share/mk/bsd.prog.mk @@ -271,6 +271,7 @@ SCRIPTSGRP_${script:T}?= ${SCRIPTSGRP} SCRIPTSMODE_${script:T}?= ${SCRIPTSMODE} STAGE_AS_${script:T}= ${SCRIPTSDIR_${script:T}}/${SCRIPTSNAME_${script:T}} _scriptsinstall: _SCRIPTSINS_${script:T} + echo ">SFD>F>DF YES" _SCRIPTSINS_${script:T}: ${script} ${INSTALL} ${TAG_ARGS} -o ${SCRIPTSOWN_${.ALLSRC:T}} \ -g ${SCRIPTSGRP_${.ALLSRC:T}} -m ${SCRIPTSMODE_${.ALLSRC:T}} \ diff --git a/share/mk/src.opts.mk b/share/mk/src.opts.mk index 9e37ce0ee5fc..8638098acf79 100644 --- a/share/mk/src.opts.mk +++ b/share/mk/src.opts.mk @@ -187,7 +187,6 @@ __DEFAULT_YES_OPTIONS = \ __DEFAULT_NO_OPTIONS = \ BSD_GREP \ - BSD_GREP_FASTMATCH \ CLANG_EXTRAS \ DTRACE_TESTS \ GNU_GREP_COMPAT \ diff --git a/tools/build/options/WITHOUT_BSD_GREP_FASTMATCH b/tools/build/options/WITHOUT_BSD_GREP_FASTMATCH deleted file mode 100644 index ffb3aafe3641..000000000000 --- a/tools/build/options/WITHOUT_BSD_GREP_FASTMATCH +++ /dev/null @@ -1,5 +0,0 @@ -.\" $FreeBSD$ -Set this option to exclude the fastmatch implementation from -.Xr bsdgrep 1 , -instead using only -.Xr regex 3 . diff --git a/tools/build/options/WITH_BSD_GREP_FASTMATCH b/tools/build/options/WITH_BSD_GREP_FASTMATCH deleted file mode 100644 index 07af5f59d243..000000000000 --- a/tools/build/options/WITH_BSD_GREP_FASTMATCH +++ /dev/null @@ -1,3 +0,0 @@ -.\" $FreeBSD$ -Set this option to use the fastmatch implementation in -.Xr bsdgrep 1 . diff --git a/usr.bin/grep/Makefile b/usr.bin/grep/Makefile index 9fd5d55bf4ea..9eec5877e348 100644 --- a/usr.bin/grep/Makefile +++ b/usr.bin/grep/Makefile @@ -17,15 +17,6 @@ bsdgrep.1: grep.1 .endif SRCS= file.c grep.c queue.c util.c -.if ${MK_BSD_GREP_FASTMATCH} == "yes" -# Extra files ported backported for some regex improvements -.PATH: ${.CURDIR}/regex -SRCS+= fastmatch.c hashtable.c tre-compile.c tre-fastmatch.c -CFLAGS+=-I${.CURDIR}/regex -.else -CFLAGS+= -DWITHOUT_FASTMATCH -.endif - SCRIPTS= zgrep.sh LINKS= ${BINDIR}/zgrep ${BINDIR}/zfgrep \ ${BINDIR}/zgrep ${BINDIR}/zegrep \ diff --git a/usr.bin/grep/grep.c b/usr.bin/grep/grep.c index 7d9dd2481fd8..827c6b987a0f 100644 --- a/usr.bin/grep/grep.c +++ b/usr.bin/grep/grep.c @@ -51,9 +51,6 @@ __FBSDID("$FreeBSD$"); #include #include -#ifndef WITHOUT_FASTMATCH -#include "fastmatch.h" -#endif #include "grep.h" #ifndef WITHOUT_NLS @@ -96,9 +93,6 @@ unsigned int patterns; static unsigned int pattern_sz; struct pat *pattern; regex_t *r_pattern; -#ifndef WITHOUT_FASTMATCH -fastmatch_t *fg_pattern; -#endif /* Filename exclusion/inclusion patterns */ unsigned int fpatterns, dpatterns; @@ -712,9 +706,6 @@ main(int argc, char *argv[]) usage(); } -#ifndef WITHOUT_FASTMATCH - fg_pattern = grep_calloc(patterns, sizeof(*fg_pattern)); -#endif r_pattern = grep_calloc(patterns, sizeof(*r_pattern)); /* Don't process any patterns if we have a blank one */ @@ -725,15 +716,6 @@ main(int argc, char *argv[]) #endif /* Check if cheating is allowed (always is for fgrep). */ for (i = 0; i < patterns; ++i) { -#ifndef WITHOUT_FASTMATCH - /* - * Attempt compilation with fastmatch regex and - * fallback to regex(3) if it fails. - */ - if (fastncomp(&fg_pattern[i], pattern[i].pat, - pattern[i].len, cflags) == 0) - continue; -#endif c = regcomp(&r_pattern[i], pattern[i].pat, cflags); if (c != 0) { regerror(c, &r_pattern[i], re_error, diff --git a/usr.bin/grep/grep.h b/usr.bin/grep/grep.h index 5ddd4d01b9e9..41a8a812309f 100644 --- a/usr.bin/grep/grep.h +++ b/usr.bin/grep/grep.h @@ -38,10 +38,6 @@ #include #include -#ifndef WITHOUT_FASTMATCH -#include "fastmatch.h" -#endif - #ifdef WITHOUT_NLS #define getstr(n) errstr[n] #else @@ -131,9 +127,6 @@ extern unsigned int dpatterns, fpatterns, patterns; extern struct pat *pattern; extern struct epat *dpattern, *fpattern; extern regex_t *er_pattern, *r_pattern; -#ifndef WITHOUT_FASTMATCH -extern fastmatch_t *fg_pattern; -#endif /* For regex errors */ #define RE_ERROR_BUF 512 diff --git a/usr.bin/grep/regex/fastmatch.c b/usr.bin/grep/regex/fastmatch.c deleted file mode 100644 index 6e3e3f8055a9..000000000000 --- a/usr.bin/grep/regex/fastmatch.c +++ /dev/null @@ -1,170 +0,0 @@ -/* $FreeBSD$ */ - -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (C) 2011 Gabor Kovesdan - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include "glue.h" - -#include -#include -#include -#include - -#include "tre-fastmatch.h" - -int -tre_fixncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags) -{ - int ret; - tre_char_t *wregex; - size_t wlen; - - if (n != 0) - { - ret = tre_convert_pattern(regex, n, &wregex, &wlen); - if (ret != REG_OK) - return ret; - else - ret = tre_compile_literal(preg, wregex, wlen, cflags); - tre_free_pattern(wregex); - return ret; - } - else - return tre_compile_literal(preg, NULL, 0, cflags); -} - -int -tre_fastncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags) -{ - int ret; - tre_char_t *wregex; - size_t wlen; - - if (n != 0) - { - ret = tre_convert_pattern(regex, n, &wregex, &wlen); - if (ret != REG_OK) - return ret; - else - ret = (cflags & REG_LITERAL) - ? tre_compile_literal(preg, wregex, wlen, cflags) - : tre_compile_fast(preg, wregex, wlen, cflags); - tre_free_pattern(wregex); - return ret; - } - else - return tre_compile_literal(preg, NULL, 0, cflags); -} - - -int -tre_fixcomp(fastmatch_t *preg, const char *regex, int cflags) -{ - return tre_fixncomp(preg, regex, regex ? strlen(regex) : 0, cflags); -} - -int -tre_fastcomp(fastmatch_t *preg, const char *regex, int cflags) -{ - return tre_fastncomp(preg, regex, regex ? strlen(regex) : 0, cflags); -} - -int -tre_fixwncomp(fastmatch_t *preg, const wchar_t *regex, size_t n, int cflags) -{ - return tre_compile_literal(preg, regex, n, cflags); -} - -int -tre_fastwncomp(fastmatch_t *preg, const wchar_t *regex, size_t n, int cflags) -{ - return (cflags & REG_LITERAL) ? - tre_compile_literal(preg, regex, n, cflags) : - tre_compile_fast(preg, regex, n, cflags); -} - -int -tre_fixwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags) -{ - return tre_fixwncomp(preg, regex, regex ? tre_strlen(regex) : 0, cflags); -} - -int -tre_fastwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags) -{ - return tre_fastwncomp(preg, regex, regex ? tre_strlen(regex) : 0, cflags); -} - -void -tre_fastfree(fastmatch_t *preg) -{ - tre_free_fast(preg); -} - -int -tre_fastnexec(const fastmatch_t *preg, const char *string, size_t len, - size_t nmatch, regmatch_t pmatch[], int eflags) -{ - tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS; - - if (eflags & REG_STARTEND) - CALL_WITH_OFFSET(tre_match_fast(preg, &string[offset], slen, - type, nmatch, pmatch, eflags)); - else - return tre_match_fast(preg, string, len, type, nmatch, - pmatch, eflags); -} - -int -tre_fastexec(const fastmatch_t *preg, const char *string, size_t nmatch, - regmatch_t pmatch[], int eflags) -{ - return tre_fastnexec(preg, string, (size_t)-1, nmatch, pmatch, eflags); -} - -int -tre_fastwnexec(const fastmatch_t *preg, const wchar_t *string, size_t len, - size_t nmatch, regmatch_t pmatch[], int eflags) -{ - tre_str_type_t type = STR_WIDE; - - if (eflags & REG_STARTEND) - CALL_WITH_OFFSET(tre_match_fast(preg, &string[offset], slen, - type, nmatch, pmatch, eflags)); - else - return tre_match_fast(preg, string, len, type, nmatch, - pmatch, eflags); -} - -int -tre_fastwexec(const fastmatch_t *preg, const wchar_t *string, - size_t nmatch, regmatch_t pmatch[], int eflags) -{ - return tre_fastwnexec(preg, string, (size_t)-1, nmatch, pmatch, eflags); -} - diff --git a/usr.bin/grep/regex/fastmatch.h b/usr.bin/grep/regex/fastmatch.h deleted file mode 100644 index 15178a02fa68..000000000000 --- a/usr.bin/grep/regex/fastmatch.h +++ /dev/null @@ -1,95 +0,0 @@ -/* $FreeBSD$ */ - -#ifndef FASTMATCH_H -#define FASTMATCH_H 1 - -#include -#include -#include -#include - -typedef struct { - size_t wlen; - size_t len; - wchar_t *wpattern; - bool *wescmap; - unsigned int qsBc[UCHAR_MAX + 1]; - unsigned int *bmGs; - char *pattern; - bool *escmap; - unsigned int defBc; - void *qsBc_table; - unsigned int *sbmGs; - const char *re_endp; - - /* flags */ - bool hasdot; - bool bol; - bool eol; - bool word; - bool icase; - bool newline; - bool nosub; - bool matchall; - bool reversed; -} fastmatch_t; - -extern int -tre_fixcomp(fastmatch_t *preg, const char *regex, int cflags); - -extern int -tre_fastcomp(fastmatch_t *preg, const char *regex, int cflags); - -extern int -tre_fastexec(const fastmatch_t *preg, const char *string, size_t nmatch, - regmatch_t pmatch[], int eflags); - -extern void -tre_fastfree(fastmatch_t *preg); - -extern int -tre_fixwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags); - -extern int -tre_fastwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags); - -extern int -tre_fastwexec(const fastmatch_t *preg, const wchar_t *string, - size_t nmatch, regmatch_t pmatch[], int eflags); - -/* Versions with a maximum length argument and therefore the capability to - handle null characters in the middle of the strings. */ -extern int -tre_fixncomp(fastmatch_t *preg, const char *regex, size_t len, int cflags); - -extern int -tre_fastncomp(fastmatch_t *preg, const char *regex, size_t len, int cflags); - -extern int -tre_fastnexec(const fastmatch_t *preg, const char *string, size_t len, - size_t nmatch, regmatch_t pmatch[], int eflags); - -extern int -tre_fixwncomp(fastmatch_t *preg, const wchar_t *regex, size_t len, int cflags); - -extern int -tre_fastwncomp(fastmatch_t *preg, const wchar_t *regex, size_t len, int cflags); - -extern int -tre_fastwnexec(const fastmatch_t *preg, const wchar_t *string, size_t len, - size_t nmatch, regmatch_t pmatch[], int eflags); - -#define fixncomp tre_fixncomp -#define fastncomp tre_fastncomp -#define fixcomp tre_fixcomp -#define fastcomp tre_fastcomp -#define fixwncomp tre_fixwncomp -#define fastwncomp tre_fastwncomp -#define fixwcomp tre_fixwcomp -#define fastwcomp tre_fastwcomp -#define fastfree tre_fastfree -#define fastnexec tre_fastnexec -#define fastexec tre_fastexec -#define fastwnexec tre_fastwnexec -#define fastwexec tre_fastwexec -#endif /* FASTMATCH_H */ diff --git a/usr.bin/grep/regex/glue.h b/usr.bin/grep/regex/glue.h deleted file mode 100644 index 0c54e98bd79b..000000000000 --- a/usr.bin/grep/regex/glue.h +++ /dev/null @@ -1,67 +0,0 @@ -/* $FreeBSD$ */ - -#ifndef GLUE_H -#define GLUE_H - -#include -#undef RE_DUP_MAX -#include -#include -#include - -#define TRE_WCHAR 1 -#define TRE_MULTIBYTE 1 -#define HAVE_MBSTATE_T 1 - -#define TRE_CHAR(n) L##n -#define CHF "%lc" - -#define tre_char_t wchar_t -#define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps))) -#define tre_strlen wcslen -#define tre_isspace iswspace -#define tre_isalnum iswalnum - -#define REG_OK 0 -#define REG_LITERAL 0020 -#define REG_WORD 0100 -#define REG_GNU 0400 - -#define TRE_MB_CUR_MAX MB_CUR_MAX - -#ifndef _GREP_DEBUG -#define DPRINT(msg) -#else -#define DPRINT(msg) do {printf msg; fflush(stdout);} while(/*CONSTCOND*/0) -#endif - -#define MIN(a,b) ((a > b) ? (b) : (a)) -#define MAX(a,b) ((a > b) ? (a) : (b)) - -typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t; - -#define CALL_WITH_OFFSET(fn) \ - do \ - { \ - size_t slen = (size_t)(pmatch[0].rm_eo - pmatch[0].rm_so); \ - size_t offset = pmatch[0].rm_so; \ - int ret; \ - \ - if ((long long)pmatch[0].rm_eo - pmatch[0].rm_so < 0) \ - return REG_NOMATCH; \ - ret = fn; \ - for (unsigned i = 0; (!preg->nosub && (i < nmatch)); i++) \ - { \ - pmatch[i].rm_so += offset; \ - pmatch[i].rm_eo += offset; \ - } \ - return ret; \ - } while (0 /*CONSTCOND*/) - -int -tre_convert_pattern(const char *regex, size_t n, tre_char_t **w, - size_t *wn); - -void -tre_free_pattern(tre_char_t *wregex); -#endif diff --git a/usr.bin/grep/regex/hashtable.c b/usr.bin/grep/regex/hashtable.c deleted file mode 100644 index 1767d0a16d66..000000000000 --- a/usr.bin/grep/regex/hashtable.c +++ /dev/null @@ -1,270 +0,0 @@ -/* $FreeBSD$ */ - -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (C) 2011 Gabor Kovesdan - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include "glue.h" - -#include -#include -#include -#include - -#include "hashtable.h" - - -/* - * Return a 32-bit hash of the given buffer. The init - * value should be 0, or the previous hash value to extend - * the previous hash. - */ -static uint32_t -hash32_buf(const void *buf, size_t len, uint32_t hash) -{ - const unsigned char *p = buf; - - while (len--) - hash = HASHSTEP(hash, *p++); - - return hash; -} - -/* - * Initializes a hash table that can hold table_size number of entries, - * each of which has a key of key_size bytes and a value of value_size - * bytes. On successful allocation returns a pointer to the hash table. - * Otherwise, returns NULL and sets errno to indicate the error. - */ -hashtable -*hashtable_init(size_t table_size, size_t key_size, size_t value_size) -{ - hashtable *tbl; - - DPRINT(("hashtable_init: table_size %zu, key_size %zu, value_size %zu\n", - table_size, key_size, value_size)); - - tbl = malloc(sizeof(hashtable)); - if (tbl == NULL) - goto mem1; - - tbl->entries = calloc(sizeof(hashtable_entry *), table_size); - if (tbl->entries == NULL) - goto mem2; - - tbl->table_size = table_size; - tbl->usage = 0; - tbl->key_size = key_size; - tbl->value_size = value_size; - - return (tbl); - -mem2: - free(tbl); -mem1: - DPRINT(("hashtable_init: allocation failed\n")); - errno = ENOMEM; - return (NULL); -} - -/* - * Places the key-value pair to the hashtable tbl. - * Returns: - * HASH_OK: if the key was not present in the hash table yet - * but the kay-value pair has been successfully added. - * HASH_UPDATED: if the value for the key has been updated with the - * new value. - * HASH_FULL: if the hash table is full and the entry could not - * be added. - * HASH_FAIL: if an error has occurred and errno has been set to - * indicate the error. - */ -int -hashtable_put(hashtable *tbl, const void *key, const void *value) -{ - uint32_t hash = 0; - - if (tbl->table_size == tbl->usage) - { - DPRINT(("hashtable_put: hashtable is full\n")); - return (HASH_FULL); - } - - hash = hash32_buf(key, tbl->key_size, hash) % tbl->table_size; - DPRINT(("hashtable_put: calculated hash %" PRIu32 "\n", hash)); - - /* - * On hash collision entries are inserted at the next free space, - * so we have to increase the index until we either find an entry - * with the same key (and update it) or we find a free space. - */ - for(;;) - { - if (tbl->entries[hash] == NULL) - break; - else if (memcmp(tbl->entries[hash]->key, key, tbl->key_size) == 0) - { - memcpy(tbl->entries[hash]->value, value, tbl->value_size); - DPRINT(("hashtable_put: effective location is %" PRIu32 - ", entry updated\n", hash)); - return (HASH_UPDATED); - } - if (++hash == tbl->table_size) - hash = 0; - } - - DPRINT(("hashtable_put: effective location is %" PRIu32 "\n", hash)); - - tbl->entries[hash] = malloc(sizeof(hashtable_entry)); - if (tbl->entries[hash] == NULL) - { - errno = ENOMEM; - goto mem1; - } - - tbl->entries[hash]->key = malloc(tbl->key_size); - if (tbl->entries[hash]->key == NULL) - { - errno = ENOMEM; - goto mem2; - } - - tbl->entries[hash]->value = malloc(tbl->value_size); - if (tbl->entries[hash]->value == NULL) - { - errno = ENOMEM; - goto mem3; - } - - memcpy(tbl->entries[hash]->key, key, tbl->key_size); - memcpy(tbl->entries[hash]->value, value, tbl->value_size); - tbl->usage++; - - DPRINT(("hashtable_put: entry successfully inserted\n")); - - return (HASH_OK); - -mem3: - free(tbl->entries[hash]->key); -mem2: - free(tbl->entries[hash]); -mem1: - DPRINT(("hashtable_put: insertion failed\n")); - return (HASH_FAIL); -} - -static hashtable_entry -**hashtable_lookup(const hashtable *tbl, const void *key) -{ - uint32_t hash = 0; - - hash = hash32_buf(key, tbl->key_size, hash) % tbl->table_size; - - for (;;) - { - if (tbl->entries[hash] == NULL) - return (NULL); - else if (memcmp(key, tbl->entries[hash]->key, tbl->key_size) == 0) - { - DPRINT(("hashtable_lookup: entry found at location %" PRIu32 "\n", hash)); - return (&tbl->entries[hash]); - } - - if (++hash == tbl->table_size) - hash = 0; - } -} - -/* - * Retrieves the value for key from the hash table tbl and places - * it to the space indicated by the value argument. - * Returns HASH_OK if the value has been found and retrieved or - * HASH_NOTFOUND otherwise. - */ -int -hashtable_get(hashtable *tbl, const void *key, void *value) -{ - hashtable_entry **entry; - - entry = hashtable_lookup(tbl, key); - if (entry == NULL) - { - DPRINT(("hashtable_get: entry is not available in the hashtable\n")); - return (HASH_NOTFOUND); - } - - memcpy(value, (*entry)->value, tbl->value_size); - DPRINT(("hashtable_get: entry successfully copied into output buffer\n")); - return (HASH_OK); -} - -/* - * Removes the entry with the specifified key from the hash table - * tbl. Returns HASH_OK if the entry has been found and removed - * or HASH_NOTFOUND otherwise. - */ -int -hashtable_remove(hashtable *tbl, const void *key) -{ - hashtable_entry **entry; - - entry = hashtable_lookup(tbl, key); - if (entry == NULL) - { - DPRINT(("hashtable_remove: entry is not available in the hashtable\n")); - return (HASH_NOTFOUND); - } - - free((*entry)->key); - free((*entry)->value); - free(*entry); - *entry = NULL; - - tbl->usage--; - DPRINT(("hashtable_remove: entry successfully removed\n")); - return (HASH_OK); -} - -/* - * Frees the resources associated with the hash table tbl. - */ -void -hashtable_free(hashtable *tbl) -{ - if (tbl == NULL) - return; - - for (unsigned int i = 0; i < tbl->table_size; i++) - if ((tbl->entries[i] != NULL)) - { - free(tbl->entries[i]->key); - free(tbl->entries[i]->value); - } - - free(tbl->entries); - DPRINT(("hashtable_free: resources are successfully freed\n")); -} diff --git a/usr.bin/grep/regex/hashtable.h b/usr.bin/grep/regex/hashtable.h deleted file mode 100644 index f8b1daa510b6..000000000000 --- a/usr.bin/grep/regex/hashtable.h +++ /dev/null @@ -1,35 +0,0 @@ -/* $FreeBSD$ */ - -#ifndef HASHTABLE_H -#define HASHTABLE_H 1 - -#include - -#define HASH_OK 0 -#define HASH_UPDATED 1 -#define HASH_FAIL 2 -#define HASH_FULL 3 -#define HASH_NOTFOUND 4 - -#define HASHSTEP(x,c) (((x << 5) + x) + (c)) - -typedef struct { - void *key; - void *value; -} hashtable_entry; - -typedef struct { - size_t key_size; - size_t table_size; - size_t usage; - size_t value_size; - hashtable_entry **entries; -} hashtable; - -void hashtable_free(hashtable *); -int hashtable_get(hashtable *, const void *, void *); -hashtable *hashtable_init(size_t, size_t, size_t); -int hashtable_put(hashtable *, const void *, const void *); -int hashtable_remove(hashtable *, const void *); - -#endif /* HASHTABLE.H */ diff --git a/usr.bin/grep/regex/tre-compile.c b/usr.bin/grep/regex/tre-compile.c deleted file mode 100644 index dbe315382aba..000000000000 --- a/usr.bin/grep/regex/tre-compile.c +++ /dev/null @@ -1,101 +0,0 @@ -/* $FreeBSD$ */ - -#include "glue.h" - -#include -#include -#include -#include -#include -#include - -int -tre_convert_pattern(const char *regex, size_t n, tre_char_t **w, - size_t *wn) -{ -#if TRE_WCHAR - tre_char_t *wregex; - size_t wlen; - - wregex = malloc(sizeof(tre_char_t) * (n + 1)); - if (wregex == NULL) - return REG_ESPACE; - - /* If the current locale uses the standard single byte encoding of - characters, we don't do a multibyte string conversion. If we did, - many applications which use the default locale would break since - the default "C" locale uses the 7-bit ASCII character set, and - all characters with the eighth bit set would be considered invalid. */ -#if TRE_MULTIBYTE - if (TRE_MB_CUR_MAX == 1) -#endif /* TRE_MULTIBYTE */ - { - unsigned int i; - const unsigned char *str = (const unsigned char *)regex; - tre_char_t *wstr = wregex; - - for (i = 0; i < n; i++) - *(wstr++) = *(str++); - wlen = n; - } -#if TRE_MULTIBYTE - else - { - int consumed; - tre_char_t *wcptr = wregex; -#ifdef HAVE_MBSTATE_T - mbstate_t state; - memset(&state, '\0', sizeof(state)); -#endif /* HAVE_MBSTATE_T */ - while (n > 0) - { - consumed = tre_mbrtowc(wcptr, regex, n, &state); - - switch (consumed) - { - case 0: - if (*regex == '\0') - consumed = 1; - else - { - free(wregex); - return REG_BADPAT; - } - break; - case -1: - DPRINT(("mbrtowc: error %d: %s.\n", errno, strerror(errno))); - free(wregex); - return REG_BADPAT; - case -2: - /* The last character wasn't complete. Let's not call it a - fatal error. */ - consumed = n; - break; - } - regex += consumed; - n -= consumed; - wcptr++; - } - wlen = wcptr - wregex; - } -#endif /* TRE_MULTIBYTE */ - wregex[wlen] = L'\0'; - *w = wregex; - *wn = wlen; - return REG_OK; -#else /* !TRE_WCHAR */ - { - *w = (tre_char_t * const *)regex; - *wn = n; - return REG_OK; - } -#endif /* !TRE_WCHAR */ -} - -void -tre_free_pattern(tre_char_t *wregex) -{ -#if TRE_WCHAR - free(wregex); -#endif -} diff --git a/usr.bin/grep/regex/tre-fastmatch.c b/usr.bin/grep/regex/tre-fastmatch.c deleted file mode 100644 index c28645602fa4..000000000000 --- a/usr.bin/grep/regex/tre-fastmatch.c +++ /dev/null @@ -1,1070 +0,0 @@ -/* $FreeBSD$ */ - -/*- - * SPDX-License-Identifier: BSD-2-Clause-FreeBSD - * - * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav - * Copyright (C) 2008-2011 Gabor Kovesdan - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - */ - -#include "glue.h" - -#include -#include -#include -#include -#include -#include -#ifdef TRE_WCHAR -#include -#include -#endif - -#include "hashtable.h" -#include "tre-fastmatch.h" - -static int fastcmp(const fastmatch_t *fg, const void *data, - tre_str_type_t type); - -/* - * Clean up if pattern compilation fails. - */ -#define FAIL_COMP(errcode) \ - { \ - if (fg->pattern) \ - free(fg->pattern); \ - if (fg->wpattern) \ - free(fg->wpattern); \ - if (fg->qsBc_table) \ - hashtable_free(fg->qsBc_table); \ - fg = NULL; \ - return errcode; \ - } - -/* - * Skips n characters in the input string and assigns the start - * address to startptr. Note: as per IEEE Std 1003.1-2008 - * matching is based on bit pattern not character representations - * so we can handle MB strings as byte sequences just like - * SB strings. - */ -#define SKIP_CHARS(n) \ - switch (type) \ - { \ - case STR_WIDE: \ - startptr = str_wide + n; \ - break; \ - default: \ - startptr = str_byte + n; \ - } - -/* - * Converts the wide string pattern to SB/MB string and stores - * it in fg->pattern. Sets fg->len to the byte length of the - * converted string. - */ -#define STORE_MBS_PAT \ - { \ - size_t siz; \ - \ - siz = wcstombs(NULL, fg->wpattern, 0); \ - if (siz == (size_t)-1) \ - return REG_BADPAT; \ - fg->len = siz; \ - fg->pattern = malloc(siz + 1); \ - if (fg->pattern == NULL) \ - return REG_ESPACE; \ - wcstombs(fg->pattern, fg->wpattern, siz); \ - fg->pattern[siz] = '\0'; \ - } \ - -#define CONV_MBS_PAT(src, dest, destsz) \ - { \ - destsz = wcstombs(NULL, src, 0); \ - if (destsz == (size_t)-1) \ - return REG_BADPAT; \ - dest = malloc(destsz + 1); \ - if (dest == NULL) \ - return REG_ESPACE; \ - wcstombs(dest, src, destsz); \ - dest[destsz] = '\0'; \ - } \ - -#define IS_OUT_OF_BOUNDS \ - ((!fg->reversed \ - ? ((type == STR_WIDE) ? ((j + fg->wlen) > len) \ - : ((j + fg->len) > len)) \ - : (j < 0))) - -/* - * Checks whether the new position after shifting in the input string - * is out of the bounds and break out from the loop if so. - */ -#define CHECKBOUNDS \ - if (IS_OUT_OF_BOUNDS) \ - break; \ - -/* - * Shifts in the input string after a mismatch. The position of the - * mismatch is stored in the mismatch variable. - */ -#define SHIFT \ - CHECKBOUNDS; \ - \ - { \ - int r = -1; \ - unsigned int bc = 0, gs = 0, ts; \ - \ - switch (type) \ - { \ - case STR_WIDE: \ - if (!fg->hasdot) \ - { \ - if (u != 0 && (unsigned)mismatch == fg->wlen - 1 - shift) \ - mismatch -= u; \ - v = fg->wlen - 1 - mismatch; \ - r = hashtable_get(fg->qsBc_table, \ - &str_wide[!fg->reversed ? (size_t)j + fg->wlen \ - : (size_t)j - 1], &bc); \ - gs = fg->bmGs[mismatch]; \ - } \ - bc = (r == HASH_OK) ? bc : fg->defBc; \ - DPRINT(("tre_fast_match: mismatch on character" CHF ", " \ - "BC %d, GS %d\n", \ - ((const tre_char_t *)startptr)[mismatch + 1], \ - bc, gs)); \ - break; \ - default: \ - if (!fg->hasdot) \ - { \ - if (u != 0 && (unsigned)mismatch == fg->len - 1 - shift) \ - mismatch -= u; \ - v = fg->len - 1 - mismatch; \ - gs = fg->sbmGs[mismatch]; \ - } \ - bc = fg->qsBc[((const unsigned char *)str_byte) \ - [!fg->reversed ? (size_t)j + fg->len \ - : (size_t)j - 1]]; \ - DPRINT(("tre_fast_match: mismatch on character %c, " \ - "BC %d, GS %d\n", \ - ((const unsigned char *)startptr)[mismatch + 1], \ - bc, gs)); \ - } \ - if (fg->hasdot) \ - shift = bc; \ - else \ - { \ - ts = (u >= v) ? (u - v) : 0; \ - shift = MAX(ts, bc); \ - shift = MAX(shift, gs); \ - if (shift == gs) \ - u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - shift, v); \ - else \ - { \ - if (ts < bc) \ - shift = MAX(shift, u + 1); \ - u = 0; \ - } \ - } \ - DPRINT(("tre_fast_match: shifting %u characters\n", shift)); \ - j = !fg->reversed ? j + shift : j - shift; \ - } - -/* - * Normal Quick Search would require a shift based on the position the - * next character after the comparison is within the pattern. With - * wildcards, the position of the last dot effects the maximum shift - * distance. - * The closer to the end the wild card is the slower the search. - * - * Examples: - * Pattern Max shift - * ------- --------- - * this 5 - * .his 4 - * t.is 3 - * th.s 2 - * thi. 1 - */ - -/* - * Fills in the bad character shift array for SB/MB strings. - */ -#define FILL_QSBC \ - if (fg->reversed) \ - { \ - _FILL_QSBC_REVERSED \ - } \ - else \ - { \ - _FILL_QSBC \ - } - -#define _FILL_QSBC \ - for (unsigned int i = 0; i <= UCHAR_MAX; i++) \ - fg->qsBc[i] = fg->len - hasdot; \ - for (unsigned int i = hasdot + 1; i < fg->len; i++) \ - { \ - fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i; \ - DPRINT(("BC shift for char %c is %zu\n", fg->pattern[i], \ - fg->len - i)); \ - if (fg->icase) \ - { \ - char c = islower((unsigned char)fg->pattern[i]) ? \ - toupper((unsigned char)fg->pattern[i]) : \ - tolower((unsigned char)fg->pattern[i]); \ - fg->qsBc[(unsigned char)c] = fg->len - i; \ - DPRINT(("BC shift for char %c is %zu\n", c, fg->len - i)); \ - } \ - } - -#define _FILL_QSBC_REVERSED \ - for (unsigned int i = 0; i <= UCHAR_MAX; i++) \ - fg->qsBc[i] = firstdot + 1; \ - for (int i = firstdot - 1; i >= 0; i--) \ - { \ - fg->qsBc[(unsigned char)fg->pattern[i]] = i + 1; \ - DPRINT(("Reverse BC shift for char %c is %d\n", fg->pattern[i], \ - i + 1)); \ - if (fg->icase) \ - { \ - char c = islower((unsigned char)fg->pattern[i]) ? \ - toupper((unsigned char)fg->pattern[i]) : \ - tolower((unsigned char)fg->pattern[i]); \ - fg->qsBc[(unsigned char)c] = i + 1; \ - DPRINT(("Reverse BC shift for char %c is %d\n", c, i + 1)); \ - } \ - } - -/* - * Fills in the bad character shifts into a hastable for wide strings. - * With wide characters it is not possible any more to use a normal - * array because there are too many characters and we could not - * provide enough memory. Fortunately, we only have to store distinct - * values for so many characters as the number of distinct characters - * in the pattern, so we can store them in a hashtable and store a - * default shift value for the rest. - */ -#define FILL_QSBC_WIDE \ - if (fg->reversed) \ - { \ - _FILL_QSBC_WIDE_REVERSED \ - } \ - else \ - { \ - _FILL_QSBC_WIDE \ - } - -#define _FILL_QSBC_WIDE \ - /* Adjust the shift based on location of the last dot ('.'). */ \ - fg->defBc = fg->wlen - whasdot; \ - \ - /* Preprocess pattern. */ \ - fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4), \ - sizeof(tre_char_t), sizeof(int)); \ - if (!fg->qsBc_table) \ - FAIL_COMP(REG_ESPACE); \ - for (unsigned int i = whasdot + 1; i < fg->wlen; i++) \ - { \ - int k = fg->wlen - i; \ - int r; \ - \ - r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \ - if ((r == HASH_FAIL) || (r == HASH_FULL)) \ - FAIL_COMP(REG_ESPACE); \ - DPRINT(("BC shift for wide char " CHF " is %d\n", fg->wpattern[i],\ - k)); \ - if (fg->icase) \ - { \ - tre_char_t wc = iswlower(fg->wpattern[i]) ? \ - towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \ - r = hashtable_put(fg->qsBc_table, &wc, &k); \ - if ((r == HASH_FAIL) || (r == HASH_FULL)) \ - FAIL_COMP(REG_ESPACE); \ - DPRINT(("BC shift for wide char " CHF " is %d\n", wc, k)); \ - } \ - } - -#define _FILL_QSBC_WIDE_REVERSED \ - /* Adjust the shift based on location of the last dot ('.'). */ \ - fg->defBc = (size_t)wfirstdot; \ - \ - /* Preprocess pattern. */ \ - fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4), \ - sizeof(tre_char_t), sizeof(int)); \ - if (!fg->qsBc_table) \ - FAIL_COMP(REG_ESPACE); \ - for (int i = wfirstdot - 1; i >= 0; i--) \ - { \ - int k = i + 1; \ - int r; \ - \ - r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \ - if ((r == HASH_FAIL) || (r == HASH_FULL)) \ - FAIL_COMP(REG_ESPACE); \ - DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", \ - fg->wpattern[i], k)); \ - if (fg->icase) \ - { \ - tre_char_t wc = iswlower(fg->wpattern[i]) ? \ - towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \ - r = hashtable_put(fg->qsBc_table, &wc, &k); \ - if ((r == HASH_FAIL) || (r == HASH_FULL)) \ - FAIL_COMP(REG_ESPACE); \ - DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", wc, \ - k)); \ - } \ - } - -#ifdef _GREP_DEBUG -#define DPRINT_BMGS(len, fmt_str, sh) \ - for (unsigned int i = 0; i < len; i++) \ - DPRINT((fmt_str, i, sh[i])); -#else -#define DPRINT_BMGS(len, fmt_str, sh) \ - do { } while(/*CONSTCOND*/0) -#endif - -/* - * Fills in the good suffix table for SB/MB strings. - */ -#define FILL_BMGS \ - if (fg->len > 0 && !fg->hasdot) \ - { \ - fg->sbmGs = malloc(fg->len * sizeof(*fg->sbmGs)); \ - if (!fg->sbmGs) \ - return REG_ESPACE; \ - if (fg->len == 1) \ - fg->sbmGs[0] = 1; \ - else \ - _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false); \ - DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs); \ - } - -/* - * Fills in the good suffix table for wide strings. - */ -#define FILL_BMGS_WIDE \ - if (fg->wlen > 0 && !fg->hasdot) \ - { \ - fg->bmGs = malloc(fg->wlen * sizeof(*fg->bmGs)); \ - if (!fg->bmGs) \ - return REG_ESPACE; \ - if (fg->wlen == 1) \ - fg->bmGs[0] = 1; \ - else \ - _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true); \ - DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n", \ - fg->bmGs); \ - } - -#define _FILL_BMGS(arr, pat, plen, wide) \ - { \ - char *p; \ - tre_char_t *wp; \ - \ - if (wide) \ - { \ - if (fg->icase) \ - { \ - wp = malloc(plen * sizeof(tre_char_t)); \ - if (wp == NULL) \ - return REG_ESPACE; \ - for (unsigned int i = 0; i < plen; i++) \ - wp[i] = towlower(pat[i]); \ - _CALC_BMGS(arr, wp, plen); \ - free(wp); \ - } \ - else \ - _CALC_BMGS(arr, pat, plen); \ - } \ - else \ - { \ - if (fg->icase) \ - { \ - p = malloc(plen); \ - if (p == NULL) \ - return REG_ESPACE; \ - for (unsigned int i = 0; i < plen; i++) \ - p[i] = tolower((unsigned char)pat[i]); \ - _CALC_BMGS(arr, p, plen); \ - free(p); \ - } \ - else \ - _CALC_BMGS(arr, pat, plen); \ - } \ - } - -#define _CALC_BMGS(arr, pat, plen) \ - { \ - int f = 0, g; \ - \ - int *suff = malloc(plen * sizeof(int)); \ - if (suff == NULL) \ - return REG_ESPACE; \ - \ - suff[plen - 1] = plen; \ - g = plen - 1; \ - for (int i = plen - 2; i >= 0; i--) \ - { \ - if (i > g && suff[i + plen - 1 - f] < i - g) \ - suff[i] = suff[i + plen - 1 - f]; \ - else \ - { \ - if (i < g) \ - g = i; \ - f = i; \ - while (g >= 0 && pat[g] == pat[g + plen - 1 - f]) \ - g--; \ - suff[i] = f - g; \ - } \ - } \ - \ - for (unsigned int i = 0; i < plen; i++) \ - arr[i] = plen; \ - g = 0; \ - for (int i = plen - 1; i >= 0; i--) \ - if (suff[i] == i + 1) \ - for(; (unsigned long)g < plen - 1 - i; g++) \ - if (arr[g] == plen) \ - arr[g] = plen - 1 - i; \ - for (unsigned int i = 0; i <= plen - 2; i++) \ - arr[plen - 1 - suff[i]] = plen - 1 - i; \ - \ - free(suff); \ - } - -/* - * Copies the pattern pat having length n to p and stores - * the size in l. - */ -#define SAVE_PATTERN(src, srclen, dst, dstlen) \ - dstlen = srclen; \ - dst = malloc((dstlen + 1) * sizeof(tre_char_t)); \ - if (dst == NULL) \ - return REG_ESPACE; \ - if (dstlen > 0) \ - memcpy(dst, src, dstlen * sizeof(tre_char_t)); \ - dst[dstlen] = TRE_CHAR('\0'); - -/* - * Initializes pattern compiling. - */ -#define INIT_COMP \ - /* Initialize. */ \ - memset(fg, 0, sizeof(*fg)); \ - fg->icase = (cflags & REG_ICASE); \ - fg->word = (cflags & REG_WORD); \ - fg->newline = (cflags & REG_NEWLINE); \ - fg->nosub = (cflags & REG_NOSUB); \ - \ - /* Cannot handle REG_ICASE with MB string */ \ - if (fg->icase && (TRE_MB_CUR_MAX > 1) && n > 0) \ - { \ - DPRINT(("Cannot use fast matcher for MBS with REG_ICASE\n")); \ - return REG_BADPAT; \ - } - -/* - * Checks whether we have a 0-length pattern that will match - * anything. If literal is set to false, the EOL anchor is also - * taken into account. - */ -#define CHECK_MATCHALL(literal) \ - if (!literal && n == 1 && pat[0] == TRE_CHAR('$')) \ - { \ - n--; \ - fg->eol = true; \ - } \ - \ - if (n == 0) \ - { \ - fg->matchall = true; \ - fg->pattern = malloc(sizeof(char)); \ - if (!fg->pattern) \ - FAIL_COMP(REG_ESPACE); \ - fg->pattern[0] = '\0'; \ - fg->wpattern = malloc(sizeof(tre_char_t)); \ - if (!fg->wpattern) \ - FAIL_COMP(REG_ESPACE); \ - fg->wpattern[0] = TRE_CHAR('\0'); \ - DPRINT(("Matching every input\n")); \ - return REG_OK; \ - } - -/* - * Returns: REG_OK on success, error code otherwise - */ -int -tre_compile_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n, - int cflags) -{ - size_t hasdot = 0, whasdot = 0; - ssize_t firstdot = -1, wfirstdot = -1; - - INIT_COMP; - - CHECK_MATCHALL(true); - - /* Cannot handle word boundaries with MB string */ - if (fg->word && (TRE_MB_CUR_MAX > 1)) - return REG_BADPAT; - -#ifdef TRE_WCHAR - SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen); - STORE_MBS_PAT; -#else - SAVE_PATTERN(pat, n, fg->pattern, fg->len); -#endif - - DPRINT(("tre_compile_literal: pattern: %s, len %zu, icase: %c, word: %c, " - "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n', - fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n')); - - FILL_QSBC; - FILL_BMGS; -#ifdef TRE_WCHAR - FILL_QSBC_WIDE; - FILL_BMGS_WIDE; -#endif - - return REG_OK; -} - -/* - * Returns: REG_OK on success, error code otherwise - */ -int -tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n, - int cflags) -{ - tre_char_t *tmp; - size_t pos = 0, hasdot = 0, whasdot = 0; - ssize_t firstdot = -1, wfirstdot = -1; - bool escaped = false; - bool *_escmap = NULL; - - INIT_COMP; - - /* Remove beginning-of-line character ('^'). */ - if (pat[0] == TRE_CHAR('^')) - { - fg->bol = true; - n--; - pat++; - } - - CHECK_MATCHALL(false); - - /* Handle word-boundary matching when GNU extensions are enabled */ - if ((cflags & REG_GNU) && (n >= 14) && - (memcmp(pat, TRE_CHAR("[[:<:]]"), 7 * sizeof(tre_char_t)) == 0) && - (memcmp(pat + n - 7, TRE_CHAR("[[:>:]]"), - 7 * sizeof(tre_char_t)) == 0)) - { - n -= 14; - pat += 7; - fg->word = true; - } - - /* Cannot handle word boundaries with MB string */ - if (fg->word && (TRE_MB_CUR_MAX > 1)) - return REG_BADPAT; - - tmp = malloc((n + 1) * sizeof(tre_char_t)); - if (tmp == NULL) - return REG_ESPACE; - -/* Copies the char into the stored pattern and skips to the next char. */ -#define STORE_CHAR \ - do \ - { \ - tmp[pos++] = pat[i]; \ - escaped = false; \ - continue; \ - } while (0) - - /* Traverse the input pattern for processing */ - for (unsigned int i = 0; i < n; i++) - { - switch (pat[i]) - { - case TRE_CHAR('\\'): - if (escaped) - STORE_CHAR; - else if (i == n - 1) - goto badpat; - else - escaped = true; - continue; - case TRE_CHAR('['): - if (escaped) - STORE_CHAR; - else - goto badpat; - continue; - case TRE_CHAR('*'): - if (escaped || (!(cflags & REG_EXTENDED) && (i == 0))) - STORE_CHAR; - else - goto badpat; - continue; - case TRE_CHAR('+'): - case TRE_CHAR('?'): - if ((cflags & REG_EXTENDED) && (i == 0)) - goto badpat; - else if ((cflags & REG_EXTENDED) ^ !escaped) - STORE_CHAR; - else - goto badpat; - continue; - case TRE_CHAR('.'): - if (escaped) - { - if (!_escmap) - _escmap = calloc(n, sizeof(bool)); - if (!_escmap) - { - free(tmp); - return REG_ESPACE; - } - _escmap[i] = true; - STORE_CHAR; - } - else - { - whasdot = i; - if (wfirstdot == -1) - wfirstdot = i; - STORE_CHAR; - } - continue; - case TRE_CHAR('^'): - STORE_CHAR; - continue; - case TRE_CHAR('$'): - if (!escaped && (i == n - 1)) - fg->eol = true; - else - STORE_CHAR; - continue; - case TRE_CHAR('('): - if ((cflags & REG_EXTENDED) ^ escaped) - goto badpat; - else - STORE_CHAR; - continue; - case TRE_CHAR('{'): - if (!(cflags & REG_EXTENDED) ^ escaped) - STORE_CHAR; - else if (!(cflags & REG_EXTENDED) && (i == 0)) - STORE_CHAR; - else if ((cflags & REG_EXTENDED) && (i == 0)) - continue; - else - goto badpat; - continue; - case TRE_CHAR('|'): - if ((cflags & REG_EXTENDED) ^ escaped) - goto badpat; - else - STORE_CHAR; - continue; - default: - if (escaped) - goto badpat; - else - STORE_CHAR; - continue; - } - continue; -badpat: - free(tmp); - DPRINT(("tre_compile_fast: compilation of pattern failed, falling" - "back to NFA\n")); - return REG_BADPAT; - } - - fg->hasdot = wfirstdot > -1; - - /* - * The pattern has been processed and copied to tmp as a literal string - * with escapes, anchors (^$) and the word boundary match character - * classes stripped out. - */ -#ifdef TRE_WCHAR - SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen); - fg->wescmap = _escmap; - STORE_MBS_PAT; - - /* - * The position of dots and escaped dots is different in the MB string - * than in to the wide string so traverse the converted string, as well, - * to store these positions. - */ - if (fg->hasdot || (fg->wescmap != NULL)) - { - if (fg->wescmap != NULL) - { - fg->escmap = calloc(fg->len, sizeof(bool)); - if (fg->escmap == NULL) - { - tre_free_fast(fg); - return REG_ESPACE; - } - } - - escaped = false; - char *_checkpat = NULL; - size_t _checklen = 0; - unsigned int escofs = 0; - /* - * Make a copy here of the original pattern, because fg->pattern has - * already been stripped of all escape sequences in the above processing. - * This is necessary if we wish to later treat fg->escmap as an actual, - * functional replacement of fg->wescmap. - */ - CONV_MBS_PAT(pat, _checkpat, _checklen); - for (unsigned int i = 0; i < n; i++) - if (_checkpat[i] == '\\') - { - escaped = !escaped; - if (escaped) - ++escofs; - } - else if (_checkpat[i] == '.' && fg->escmap != NULL && escaped) - { - fg->escmap[i - escofs] = true; - escaped = false; - } - else if (_checkpat[i] == '.' && !escaped) - { - hasdot = i; - if (firstdot == -1) - firstdot = i; - } - else - escaped = false; - free(_checkpat); - } -#else - SAVE_PATTERN(tmp, pos, fg->pattern, fg->len); - fg->escmap = _escmap; -#endif - - free(tmp); - - DPRINT(("tre_compile_fast: pattern: %s, len %zu, bol %c, eol %c, " - "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len, - fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n', - fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n', - fg->newline ? 'y' : 'n')); - - /* Check whether reverse QS algorithm is more efficient */ - if ((wfirstdot > -1) && (fg->wlen - whasdot + 1 < (size_t)wfirstdot) && - fg->nosub) - { - fg->reversed = true; - DPRINT(("tre_compile_fast: using reverse QS algorithm\n")); - } - - FILL_QSBC; - FILL_BMGS; -#ifdef TRE_WCHAR - FILL_QSBC_WIDE; - FILL_BMGS_WIDE; -#endif - - return REG_OK; -} - -#define _SHIFT_ONE \ - { \ - shift = 1; \ - j = !fg->reversed ? j + shift : j - shift; \ - continue; \ - } - -#define _BBOUND_COND \ - ((type == STR_WIDE) ? \ - ((j == 0) || !(tre_isalnum(str_wide[j - 1]) || \ - (str_wide[j - 1] == TRE_CHAR('_')))) : \ - ((j == 0) || !(tre_isalnum(str_byte[j - 1]) || \ - (str_byte[j - 1] == '_')))) - -#define _EBOUND_COND \ - ((type == STR_WIDE) ? \ - ((j + fg->wlen == len) || !(tre_isalnum(str_wide[j + fg->wlen]) || \ - (str_wide[j + fg->wlen] == TRE_CHAR('_')))) : \ - ((j + fg->len == len) || !(tre_isalnum(str_byte[j + fg->len]) || \ - (str_byte[j + fg->len] == '_')))) - -/* - * Condition to check whether the match on position j is on a - * word boundary. - */ -#define IS_ON_WORD_BOUNDARY \ - (_BBOUND_COND && _EBOUND_COND) - -/* - * Checks word boundary and shifts one if match is not on a - * boundary. - */ -#define CHECK_WORD_BOUNDARY \ - if (!IS_ON_WORD_BOUNDARY) \ - _SHIFT_ONE; - -#define _BOL_COND \ - ((j == 0) || ((type == STR_WIDE) ? (str_wide[j - 1] == TRE_CHAR('\n'))\ - : (str_byte[j - 1] == '\n'))) - -/* - * Checks BOL anchor and shifts one if match is not on a - * boundary. - */ -#define CHECK_BOL_ANCHOR \ - if (!_BOL_COND) \ - _SHIFT_ONE; - -#define _EOL_COND \ - ((type == STR_WIDE) \ - ? ((j + fg->wlen == len) || \ - (str_wide[j + fg->wlen] == TRE_CHAR('\n'))) \ - : ((j + fg->len == len) || (str_byte[j + fg->wlen] == '\n'))) - -/* - * Checks EOL anchor and shifts one if match is not on a - * boundary. - */ -#define CHECK_EOL_ANCHOR \ - if (!_EOL_COND) \ - _SHIFT_ONE; - -/* - * Executes matching of the precompiled pattern on the input string. - * Returns REG_OK or REG_NOMATCH depending on if we find a match or not. - */ -int -tre_match_fast(const fastmatch_t *fg, const void *data, size_t len, - tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags) -{ - unsigned int shift, u = 0, v = 0; - ssize_t j = 0; - int ret = REG_NOMATCH; - int mismatch; - const char *str_byte = data; - const void *startptr = NULL; - const tre_char_t *str_wide = data; - - /* Calculate length if unspecified. */ - if (len == (size_t)-1) - switch (type) - { - case STR_WIDE: - len = tre_strlen(str_wide); - break; - default: - len = strlen(str_byte); - break; - } - - /* Shortcut for empty pattern */ - if (fg->matchall) - { - if (!fg->nosub && nmatch >= 1) - { - pmatch[0].rm_so = 0; - pmatch[0].rm_eo = len; - } - if (fg->bol && fg->eol) - return (len == 0) ? REG_OK : REG_NOMATCH; - else - return REG_OK; - } - - /* No point in going farther if we do not have enough data. */ - switch (type) - { - case STR_WIDE: - if (len < fg->wlen) - return ret; - shift = fg->wlen; - break; - default: - if (len < fg->len) - return ret; - shift = fg->len; - } - - /* - * REG_NOTBOL means not anchoring ^ to the beginning of the line, so we - * can shift one because there can't be a match at the beginning. - */ - if (fg->bol && (eflags & REG_NOTBOL)) - j = 1; - - /* - * Like above, we cannot have a match at the very end when anchoring to - * the end and REG_NOTEOL is specified. - */ - if (fg->eol && (eflags & REG_NOTEOL)) - len--; - - if (fg->reversed) - j = len - (type == STR_WIDE ? fg->wlen : fg->len); - - - /* Only try once at the beginning or ending of the line. */ - if ((fg->bol || fg->eol) && !fg->newline && !(eflags & REG_NOTBOL) && - !(eflags & REG_NOTEOL)) - { - /* Simple text comparison. */ - if (!((fg->bol && fg->eol) && - (type == STR_WIDE ? (len != fg->wlen) : (len != fg->len)))) - { - /* Determine where in data to start search at. */ - j = fg->eol ? len - (type == STR_WIDE ? fg->wlen : fg->len) : 0; - SKIP_CHARS(j); - mismatch = fastcmp(fg, startptr, type); - if (mismatch == REG_OK) - { - if (fg->word && !IS_ON_WORD_BOUNDARY) - return ret; - if (!fg->nosub && nmatch >= 1) - { - pmatch[0].rm_so = j; - pmatch[0].rm_eo = j + (type == STR_WIDE ? fg->wlen : fg->len); - } - return REG_OK; - } - } - } - else - { - /* Quick Search / Turbo Boyer-Moore algorithm. */ - do - { - SKIP_CHARS(j); - mismatch = fastcmp(fg, startptr, type); - if (mismatch == REG_OK) - { - if (fg->word) - CHECK_WORD_BOUNDARY; - if (fg->bol) - CHECK_BOL_ANCHOR; - if (fg->eol) - CHECK_EOL_ANCHOR; - if (!fg->nosub && nmatch >= 1) - { - pmatch[0].rm_so = j; - pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len); - } - return REG_OK; - } - else if (mismatch > 0) - return mismatch; - mismatch = -mismatch - 1; - SHIFT; - } while (!IS_OUT_OF_BOUNDS); - } - return ret; -} - -/* - * Frees the resources that were allocated when the pattern was compiled. - */ -void -tre_free_fast(fastmatch_t *fg) -{ - - DPRINT(("tre_fast_free: freeing structures for pattern %s\n", - fg->pattern)); - -#ifdef TRE_WCHAR - hashtable_free(fg->qsBc_table); - if (!fg->hasdot) - free(fg->bmGs); - if (fg->wescmap) - free(fg->wescmap); - free(fg->wpattern); -#endif - if (!fg->hasdot) - free(fg->sbmGs); - if (fg->escmap) - free(fg->escmap); - free(fg->pattern); -} - -/* - * Returns: -(i + 1) on failure (position that it failed with minus sign) - * error code on error - * REG_OK on success - */ -static inline int -fastcmp(const fastmatch_t *fg, const void *data, tre_str_type_t type) -{ - const char *str_byte = data; - const char *pat_byte = fg->pattern; - const tre_char_t *str_wide = data; - const tre_char_t *pat_wide = fg->wpattern; - const bool *escmap = (type == STR_WIDE) ? fg->wescmap : fg->escmap; - size_t len = (type == STR_WIDE) ? fg->wlen : fg->len; - int ret = REG_OK; - - /* Compare the pattern and the input char-by-char from the last position. */ - for (int i = len - 1; i >= 0; i--) { - switch (type) - { - case STR_WIDE: - - /* Check dot */ - if (fg->hasdot && pat_wide[i] == TRE_CHAR('.') && - (!escmap || !escmap[i]) && - (!fg->newline || (str_wide[i] != TRE_CHAR('\n')))) - continue; - - /* Compare */ - if (fg->icase ? (towlower(pat_wide[i]) == towlower(str_wide[i])) - : (pat_wide[i] == str_wide[i])) - continue; - break; - default: - /* Check dot */ - if (fg->hasdot && pat_byte[i] == '.' && - (!escmap || !escmap[i]) && - (!fg->newline || (str_byte[i] != '\n'))) - continue; - - /* Compare */ - if (fg->icase ? (tolower((unsigned char)pat_byte[i]) == tolower((unsigned char)str_byte[i])) - : (pat_byte[i] == str_byte[i])) - continue; - } - DPRINT(("fastcmp: mismatch at position %d\n", i)); - ret = -(i + 1); - break; - } - return ret; -} diff --git a/usr.bin/grep/regex/tre-fastmatch.h b/usr.bin/grep/regex/tre-fastmatch.h deleted file mode 100644 index f72397c58c33..000000000000 --- a/usr.bin/grep/regex/tre-fastmatch.h +++ /dev/null @@ -1,21 +0,0 @@ -/* $FreeBSD$ */ - -#ifndef TRE_FASTMATCH_H -#define TRE_FASTMATCH_H 1 - -#include -#include -#include -#include -#include - -#include "hashtable.h" - -int tre_compile_literal(fastmatch_t *preg, const tre_char_t *regex, - size_t, int); -int tre_compile_fast(fastmatch_t *preg, const tre_char_t *regex, size_t, int); -int tre_match_fast(const fastmatch_t *fg, const void *data, size_t len, - tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags); -void tre_free_fast(fastmatch_t *preg); - -#endif /* TRE_FASTMATCH_H */ diff --git a/usr.bin/grep/util.c b/usr.bin/grep/util.c index 695853a6be2f..e53558e5e0ba 100644 --- a/usr.bin/grep/util.c +++ b/usr.bin/grep/util.c @@ -52,9 +52,6 @@ __FBSDID("$FreeBSD$"); #include #include -#ifndef WITHOUT_FASTMATCH -#include "fastmatch.h" -#endif #include "grep.h" static bool first_match = true; @@ -512,14 +509,8 @@ procline(struct parsec *pc) r = litexec(&pattern[i], pc->ln.dat, 1, &pmatch); else #endif -#ifndef WITHOUT_FASTMATCH - if (fg_pattern[i].pattern) - r = fastexec(&fg_pattern[i], - pc->ln.dat, 1, &pmatch, leflags); - else -#endif - r = regexec(&r_pattern[i], pc->ln.dat, 1, - &pmatch, leflags); + r = regexec(&r_pattern[i], pc->ln.dat, 1, &pmatch, + leflags); if (r != 0) continue; /* Check for full match */ @@ -527,11 +518,7 @@ procline(struct parsec *pc) (size_t)pmatch.rm_eo != pc->ln.len)) continue; /* Check for whole word match */ -#ifndef WITHOUT_FASTMATCH - if (wflag || fg_pattern[i].word) { -#else if (wflag) { -#endif wbegin = wend = L' '; if (pmatch.rm_so != 0 && sscanf(&pc->ln.dat[pmatch.rm_so - 1],