From c66bbc91434501b889413241e8f1bd1487a92c5b Mon Sep 17 00:00:00 2001 From: Gabor Kovesdan Date: Fri, 11 May 2012 12:37:16 +0000 Subject: [PATCH] Add a BSD-licensed sort rewrite that was started by me and later completed with the major functionality and optimizations by Oleg Moskalenko. It is compatible with the latest version of POSIX and the current GNU sort version that we have in base. Beside this, it implements all the functionality introduced in later versions of GNU sort. For now, it will be installed as "bsdsort", keeping GNU sort as the default sort implementation. --- usr.bin/sort/Makefile | 45 + usr.bin/sort/bwstring.c | 1138 ++++++++++++++++++ usr.bin/sort/bwstring.h | 142 +++ usr.bin/sort/coll.c | 1307 +++++++++++++++++++++ usr.bin/sort/coll.h | 167 +++ usr.bin/sort/file.c | 1615 ++++++++++++++++++++++++++ usr.bin/sort/file.h | 132 +++ usr.bin/sort/mem.c | 82 ++ usr.bin/sort/mem.h | 45 + usr.bin/sort/nls/C.msg | 10 + usr.bin/sort/nls/hu_HU.ISO8859-2.msg | 10 + usr.bin/sort/radixsort.c | 674 +++++++++++ usr.bin/sort/radixsort.h | 38 + usr.bin/sort/sort.1.in | 635 ++++++++++ usr.bin/sort/sort.c | 1324 +++++++++++++++++++++ usr.bin/sort/sort.h | 129 ++ usr.bin/sort/vsort.c | 265 +++++ usr.bin/sort/vsort.h | 37 + 18 files changed, 7795 insertions(+) create mode 100644 usr.bin/sort/Makefile create mode 100644 usr.bin/sort/bwstring.c create mode 100644 usr.bin/sort/bwstring.h create mode 100644 usr.bin/sort/coll.c create mode 100644 usr.bin/sort/coll.h create mode 100644 usr.bin/sort/file.c create mode 100644 usr.bin/sort/file.h create mode 100644 usr.bin/sort/mem.c create mode 100644 usr.bin/sort/mem.h create mode 100644 usr.bin/sort/nls/C.msg create mode 100644 usr.bin/sort/nls/hu_HU.ISO8859-2.msg create mode 100644 usr.bin/sort/radixsort.c create mode 100644 usr.bin/sort/radixsort.h create mode 100644 usr.bin/sort/sort.1.in create mode 100644 usr.bin/sort/sort.c create mode 100644 usr.bin/sort/sort.h create mode 100644 usr.bin/sort/vsort.c create mode 100644 usr.bin/sort/vsort.h diff --git a/usr.bin/sort/Makefile b/usr.bin/sort/Makefile new file mode 100644 index 000000000000..162df21cd5e1 --- /dev/null +++ b/usr.bin/sort/Makefile @@ -0,0 +1,45 @@ +# $FreeBSD$ + +.include + +.if ${MK_BSD_SORT} == "yes" +PROG= sort +.else +PROG= bsdsort +CLEANFILES+= bsdsort.1 + +bsdsort.1: sort.1 + cp ${.ALLSRC} ${.TARGET} +.endif + +SRCS= bwstring.c coll.c file.c mem.c radixsort.c sort.c vsort.c + +WARNS= 6 + +sort.1: sort.1.in + /usr/bin/sed ${MAN_SUB} ${.ALLSRC} >${.TARGET} + +CLEANFILES+= sort.1 + +.if !defined(WITHOUT_THREADS) +CFLAGS+= -DSORT_THREADS +LDFLAGS+= -lpthread -lmd +MAN_SUB+= -e 's|%%THREADS%%||g' +.else +LDFLAGS+= -lmd +MAN_SUB+= -e 's|%%THREADS%%|\.\\"|g' +.endif + +.if !defined(WITHOUT_NLS) +NLS+= hu_HU.ISO8859-2 +NLSSRCFILES= ${NLS:S@$@.msg@} +MAN_SUB+= -e 's|%%NLS%%||g' +.for lang in ${NLS} +NLSSRCDIR_${lang}= ${.CURDIR}/nls +.endfor +.else +CFLAGS+= -DWITHOUT_NLS +MAN_SUB+= -e 's|%%THREADS%%|\.\\"|g' +.endif + +.include diff --git a/usr.bin/sort/bwstring.c b/usr.bin/sort/bwstring.c new file mode 100644 index 000000000000..20587de10677 --- /dev/null +++ b/usr.bin/sort/bwstring.c @@ -0,0 +1,1138 @@ +/*- + * Copyright (C) 2009 Gabor Kovesdan + * Copyright (C) 2012 Oleg Moskalenko + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bwstring.h" +#include "sort.h" + +bool byte_sort = false; + +static wchar_t **wmonths = NULL; +static unsigned char **cmonths = NULL; + +/* initialise months */ + +void +initialise_months(void) +{ + const nl_item item[12] = { ABMON_1, ABMON_2, ABMON_3, ABMON_4, + ABMON_5, ABMON_6, ABMON_7, ABMON_8, ABMON_9, ABMON_10, + ABMON_11, ABMON_12 }; + unsigned char *tmp; + size_t len; + + if (MB_CUR_MAX == 1) { + if (cmonths == NULL) { + unsigned char *m; + + cmonths = sort_malloc(sizeof(unsigned char*) * 12); + for (int i = 0; i < 12; i++) { + cmonths[i] = NULL; + tmp = (unsigned char *) nl_langinfo(item[i]); + if (tmp == NULL) + continue; + if (debug_sort) + printf("month[%d]=%s\n", i, tmp); + len = strlen(tmp); + if (len < 1) + continue; + while (isblank(*tmp)) + ++tmp; + m = sort_malloc(len + 1); + memcpy(m, tmp, len + 1); + m[len] = '\0'; + for (unsigned int j = 0; j < len; j++) + m[j] = toupper(m[j]); + cmonths[i] = m; + } + } + + } else { + if (wmonths == NULL) { + wchar_t *m; + + wmonths = sort_malloc(sizeof(wchar_t *) * 12); + for (int i = 0; i < 12; i++) { + wmonths[i] = NULL; + tmp = (unsigned char *) nl_langinfo(item[i]); + if (tmp == NULL) + continue; + if (debug_sort) + printf("month[%d]=%s\n", i, tmp); + len = strlen(tmp); + if (len < 1) + continue; + while (isblank(*tmp)) + ++tmp; + m = sort_malloc(SIZEOF_WCHAR_STRING(len + 1)); + if (mbstowcs(m, tmp, len) == ((size_t) -1)) + continue; + m[len] = L'\0'; + for (unsigned int j = 0; j < len; j++) + m[j] = towupper(m[j]); + wmonths[i] = m; + } + } + } +} + +/* + * Compare two wide-character strings + */ +static int +wide_str_coll(const wchar_t *s1, const wchar_t *s2) +{ + int ret = 0; + + errno = 0; + ret = wcscoll(s1, s2); + if (errno == EILSEQ) { + errno = 0; + ret = wcscmp(s1, s2); + if (errno != 0) { + for (size_t i = 0; ; ++i) { + wchar_t c1 = s1[i]; + wchar_t c2 = s2[i]; + if (c1 == L'\0') + return ((c2 == L'\0') ? 0 : -1); + if (c2 == L'\0') + return (+1); + if (c1 == c2) + continue; + return ((int)(c1 - c2)); + } + } + } + return (ret); +} + +/* counterparts of wcs functions */ + +void +bwsprintf(FILE *f, struct bwstring *bws, const char *prefix, const char *suffix) +{ + + if (MB_CUR_MAX == 1) + fprintf(f, "%s%s%s", prefix, bws->data.cstr, suffix); + else + fprintf(f, "%s%S%s", prefix, bws->data.wstr, suffix); +} + +const void* bwsrawdata(const struct bwstring *bws) +{ + + return (&(bws->data)); +} + +size_t bwsrawlen(const struct bwstring *bws) +{ + + return ((MB_CUR_MAX == 1) ? bws->len : SIZEOF_WCHAR_STRING(bws->len)); +} + +size_t +bws_memsize(const struct bwstring *bws) +{ + + return ((MB_CUR_MAX == 1) ? (bws->len + 2 + sizeof(struct bwstring)) : + (SIZEOF_WCHAR_STRING(bws->len + 1) + sizeof(struct bwstring))); +} + +void +bws_setlen(struct bwstring *bws, size_t newlen) +{ + + if (bws && newlen != bws->len && newlen <= bws->len) { + bws->len = newlen; + if (MB_CUR_MAX == 1) + bws->data.cstr[newlen] = '\0'; + else + bws->data.wstr[newlen] = L'\0'; + } +} + +/* + * Allocate a new binary string of specified size + */ +struct bwstring * +bwsalloc(size_t sz) +{ + struct bwstring *ret; + + if (MB_CUR_MAX == 1) + ret = sort_malloc(sizeof(struct bwstring) + 1 + sz); + else + ret = sort_malloc(sizeof(struct bwstring) + + SIZEOF_WCHAR_STRING(sz + 1)); + ret->len = sz; + + if (MB_CUR_MAX == 1) + ret->data.cstr[ret->len] = '\0'; + else + ret->data.wstr[ret->len] = L'\0'; + + return (ret); +} + +/* + * Create a copy of binary string. + * New string size equals the length of the old string. + */ +struct bwstring * +bwsdup(const struct bwstring *s) +{ + + if (s == NULL) + return (NULL); + else { + struct bwstring *ret = bwsalloc(s->len); + + if (MB_CUR_MAX == 1) + memcpy(ret->data.cstr, s->data.cstr, (s->len)); + else + memcpy(ret->data.wstr, s->data.wstr, + SIZEOF_WCHAR_STRING(s->len)); + + return (ret); + } +} + +/* + * Create a new binary string from a raw binary buffer. + */ +struct bwstring * +bwssbdup(const wchar_t *str, size_t len) +{ + + if (str == NULL) + return ((len == 0) ? bwsalloc(0) : NULL); + else { + struct bwstring *ret; + + ret = bwsalloc(len); + + if (MB_CUR_MAX == 1) + for (size_t i = 0; i < len; ++i) + ret->data.cstr[i] = (unsigned char) str[i]; + else + memcpy(ret->data.wstr, str, SIZEOF_WCHAR_STRING(len)); + + return (ret); + } +} + +/* + * Create a new binary string from a raw binary buffer. + */ +struct bwstring * +bwscsbdup(const unsigned char *str, size_t len) +{ + struct bwstring *ret; + + ret = bwsalloc(len); + + if (str) { + if (MB_CUR_MAX == 1) + memcpy(ret->data.cstr, str, len); + else { + mbstate_t mbs; + const char *s; + size_t charlen, chars, cptr; + + charlen = chars = 0; + cptr = 0; + s = (const char *) str; + + memset(&mbs, 0, sizeof(mbs)); + + while (cptr < len) { + size_t n = MB_CUR_MAX; + + if (n > len - cptr) + n = len - cptr; + charlen = mbrlen(s + cptr, n, &mbs); + switch (charlen) { + case 0: + /* FALLTHROUGH */ + case (size_t) -1: + /* FALLTHROUGH */ + case (size_t) -2: + ret->data.wstr[chars++] = + (unsigned char) s[cptr]; + ++cptr; + break; + default: + n = mbrtowc(ret->data.wstr + (chars++), + s + cptr, charlen, &mbs); + if ((n == (size_t)-1) || (n == (size_t)-2)) + /* NOTREACHED */ + err(2, "mbrtowc error"); + cptr += charlen; + }; + } + + ret->len = chars; + ret->data.wstr[ret->len] = L'\0'; + } + } + return (ret); +} + +/* + * De-allocate object memory + */ +void +bwsfree(const struct bwstring *s) +{ + + if (s) + sort_free(s); +} + +/* + * Copy content of src binary string to dst. + * If the capacity of the dst string is not sufficient, + * then the data is truncated. + */ +size_t +bwscpy(struct bwstring *dst, const struct bwstring *src) +{ + size_t nums = src->len; + + if (nums > dst->len) + nums = dst->len; + dst->len = nums; + + if (MB_CUR_MAX == 1) { + memcpy(dst->data.cstr, src->data.cstr, nums); + dst->data.cstr[dst->len] = '\0'; + } else { + memcpy(dst->data.wstr, src->data.wstr, + SIZEOF_WCHAR_STRING(nums + 1)); + dst->data.wstr[dst->len] = L'\0'; + } + + return (nums); +} + +/* + * Copy content of src binary string to dst, + * with specified number of symbols to be copied. + * If the capacity of the dst string is not sufficient, + * then the data is truncated. + */ +struct bwstring * +bwsncpy(struct bwstring *dst, const struct bwstring *src, size_t size) +{ + size_t nums = src->len; + + if (nums > dst->len) + nums = dst->len; + if (nums > size) + nums = size; + dst->len = nums; + + if (MB_CUR_MAX == 1) { + memcpy(dst->data.cstr, src->data.cstr, nums); + dst->data.cstr[dst->len] = '\0'; + } else { + memcpy(dst->data.wstr, src->data.wstr, + SIZEOF_WCHAR_STRING(nums + 1)); + dst->data.wstr[dst->len] = L'\0'; + } + + return (dst); +} + +/* + * Copy content of src binary string to dst, + * with specified number of symbols to be copied. + * An offset value can be specified, from the start of src string. + * If the capacity of the dst string is not sufficient, + * then the data is truncated. + */ +struct bwstring * +bwsnocpy(struct bwstring *dst, const struct bwstring *src, size_t offset, + size_t size) +{ + + if (offset >= src->len) { + dst->data.wstr[0] = 0; + dst->len = 0; + } else { + size_t nums = src->len - offset; + + if (nums > dst->len) + nums = dst->len; + if (nums > size) + nums = size; + dst->len = nums; + if (MB_CUR_MAX == 1) { + memcpy(dst->data.cstr, src->data.cstr + offset, + (nums)); + dst->data.cstr[dst->len] = '\0'; + } else { + memcpy(dst->data.wstr, src->data.wstr + offset, + SIZEOF_WCHAR_STRING(nums)); + dst->data.wstr[dst->len] = L'\0'; + } + } + return (dst); +} + +/* + * Write binary string to the file. + * The output is ended either with '\n' (nl == true) + * or '\0' (nl == false). + */ +int +bwsfwrite(struct bwstring *bws, FILE *f, bool zero_ended) +{ + + if (MB_CUR_MAX == 1) { + size_t len = bws->len; + + if (!zero_ended) { + bws->data.cstr[len] = '\n'; + + if (fwrite(bws->data.cstr, len + 1, 1, f) < 1) + err(2, NULL); + + bws->data.cstr[len] = '\0'; + } else if (fwrite(bws->data.cstr, len + 1, 1, f) < 1) + err(2, NULL); + + return (len + 1); + + } else { + wchar_t eols; + int printed = 0; + + eols = zero_ended ? btowc('\0') : btowc('\n'); + + while (printed < (int) BWSLEN(bws)) { + const wchar_t *s = bws->data.wstr + printed; + + if (*s == L'\0') { + int nums; + + nums = fwprintf(f, L"%lc", *s); + + if (nums != 1) + err(2, NULL); + ++printed; + } else { + int nums; + + nums = fwprintf(f, L"%ls", s); + + if (nums < 1) + err(2, NULL); + printed += nums; + } + } + fwprintf(f, L"%lc", eols); + return (printed + 1); + } +} + +/* + * Allocate and read a binary string from file. + * The strings are nl-ended or zero-ended, depending on the sort setting. + */ +struct bwstring * +bwsfgetln(FILE *f, size_t *len, bool zero_ended, struct reader_buffer *rb) +{ + wchar_t eols; + + eols = zero_ended ? btowc('\0') : btowc('\n'); + + if (!zero_ended && (MB_CUR_MAX > 1)) { + wchar_t *ret; + + ret = fgetwln(f, len); + + if (ret == NULL) { + if (!feof(f)) + err(2, NULL); + return (NULL); + } + if (*len > 0) { + if (ret[*len - 1] == eols) + --(*len); + } + return (bwssbdup(ret, *len)); + + } else { + wchar_t c = 0; + + *len = 0; + + if (feof(f)) + return (NULL); + + if (2 >= rb->fgetwln_z_buffer_size) { + rb->fgetwln_z_buffer_size += 256; + rb->fgetwln_z_buffer = sort_realloc(rb->fgetwln_z_buffer, + sizeof(wchar_t) * rb->fgetwln_z_buffer_size); + } + rb->fgetwln_z_buffer[*len] = 0; + + if (MB_CUR_MAX == 1) + while (!feof(f)) { + c = fgetc(f); + + if (c == EOF) { + if (*len == 0) + return (NULL); + goto line_read_done; + } + if (c == eols) + goto line_read_done; + + if (*len + 1 >= rb->fgetwln_z_buffer_size) { + rb->fgetwln_z_buffer_size += 256; + rb->fgetwln_z_buffer = sort_realloc(rb->fgetwln_z_buffer, + SIZEOF_WCHAR_STRING(rb->fgetwln_z_buffer_size)); + } + + rb->fgetwln_z_buffer[*len] = c; + rb->fgetwln_z_buffer[++(*len)] = 0; + } + else + while (!feof(f)) { + c = fgetwc(f); + + if (c == WEOF) { + if (*len == 0) + return (NULL); + goto line_read_done; + } + if (c == eols) + goto line_read_done; + + if (*len + 1 >= rb->fgetwln_z_buffer_size) { + rb->fgetwln_z_buffer_size += 256; + rb->fgetwln_z_buffer = sort_realloc(rb->fgetwln_z_buffer, + SIZEOF_WCHAR_STRING(rb->fgetwln_z_buffer_size)); + } + + rb->fgetwln_z_buffer[*len] = c; + rb->fgetwln_z_buffer[++(*len)] = 0; + } + +line_read_done: + /* we do not count the last 0 */ + return (bwssbdup(rb->fgetwln_z_buffer, *len)); + } +} + +int +bwsncmp(const struct bwstring *bws1, const struct bwstring *bws2, + size_t offset, size_t len) +{ + size_t cmp_len, len1, len2; + int res = 0; + + cmp_len = 0; + len1 = bws1->len; + len2 = bws2->len; + + if (len1 <= offset) { + return ((len2 <= offset) ? 0 : -1); + } else { + if (len2 <= offset) + return (+1); + else { + len1 -= offset; + len2 -= offset; + + cmp_len = len1; + + if (len2 < cmp_len) + cmp_len = len2; + + if (len < cmp_len) + cmp_len = len; + + if (MB_CUR_MAX == 1) { + const unsigned char *s1, *s2; + + s1 = bws1->data.cstr + offset; + s2 = bws2->data.cstr + offset; + + res = memcmp(s1, s2, cmp_len); + + } else { + const wchar_t *s1, *s2; + + s1 = bws1->data.wstr + offset; + s2 = bws2->data.wstr + offset; + + res = memcmp(s1, s2, SIZEOF_WCHAR_STRING(cmp_len)); + } + } + } + + if (res == 0) { + if (len1 < cmp_len && len1 < len2) + res = -1; + else if (len2 < cmp_len && len2 < len1) + res = +1; + } + + return (res); +} + +int +bwscmp(const struct bwstring *bws1, const struct bwstring *bws2, size_t offset) +{ + size_t len1, len2, cmp_len; + int res; + + len1 = bws1->len; + len2 = bws2->len; + + len1 -= offset; + len2 -= offset; + + cmp_len = len1; + + if (len2 < cmp_len) + cmp_len = len2; + + res = bwsncmp(bws1, bws2, offset, cmp_len); + + if (res == 0) { + if( len1 < len2) + res = -1; + else if (len2 < len1) + res = +1; + } + + return (res); +} + +int +bws_iterator_cmp(bwstring_iterator iter1, bwstring_iterator iter2, size_t len) +{ + wchar_t c1, c2; + size_t i = 0; + + for (i = 0; i < len; ++i) { + c1 = bws_get_iter_value(iter1); + c2 = bws_get_iter_value(iter2); + if (c1 != c2) + return (c1 - c2); + iter1 = bws_iterator_inc(iter1, 1); + iter2 = bws_iterator_inc(iter2, 1); + } + + return (0); +} + +int +bwscoll(const struct bwstring *bws1, const struct bwstring *bws2, size_t offset) +{ + size_t len1, len2; + + len1 = bws1->len; + len2 = bws2->len; + + if (len1 <= offset) + return ((len2 <= offset) ? 0 : -1); + else { + if (len2 <= offset) + return (+1); + else { + len1 -= offset; + len2 -= offset; + + if (MB_CUR_MAX == 1) { + const unsigned char *s1, *s2; + + s1 = bws1->data.cstr + offset; + s2 = bws2->data.cstr + offset; + + if (byte_sort) { + int res = 0; + + if (len1 > len2) { + res = memcmp(s1, s2, len2); + if (!res) + res = +1; + } else if (len1 < len2) { + res = memcmp(s1, s2, len1); + if (!res) + res = -1; + } else + res = memcmp(s1, s2, len1); + + return (res); + + } else { + int res = 0; + size_t i, maxlen; + + i = 0; + maxlen = len1; + + if (maxlen > len2) + maxlen = len2; + + while (i < maxlen) { + /* goto next non-zero part: */ + while ((i < maxlen) && + !s1[i] && !s2[i]) + ++i; + + if (i >= maxlen) + break; + + if (s1[i] == 0) { + if (s2[i] == 0) + /* NOTREACHED */ + err(2, "bwscoll error 01"); + else + return (-1); + } else if (s2[i] == 0) + return (+1); + + res = strcoll(s1 + i, s2 + i); + if (res) + return (res); + + while ((i < maxlen) && + s1[i] && s2[i]) + ++i; + + if (i >= maxlen) + break; + + if (s1[i] == 0) { + if (s2[i] == 0) { + ++i; + continue; + } else + return (-1); + } else if (s2[i] == 0) + return (+1); + else + /* NOTREACHED */ + err(2, "bwscoll error 02"); + } + + if (len1 < len2) + return (-1); + else if (len1 > len2) + return (+1); + + return (0); + } + } else { + const wchar_t *s1, *s2; + size_t i, maxlen; + int res = 0; + + s1 = bws1->data.wstr + offset; + s2 = bws2->data.wstr + offset; + + i = 0; + maxlen = len1; + + if (maxlen > len2) + maxlen = len2; + + while (i < maxlen) { + + /* goto next non-zero part: */ + while ((i < maxlen) && + !s1[i] && !s2[i]) + ++i; + + if (i >= maxlen) + break; + + if (s1[i] == 0) { + if (s2[i] == 0) + /* NOTREACHED */ + err(2, "bwscoll error 1"); + else + return (-1); + } else if (s2[i] == 0) + return (+1); + + res = wide_str_coll(s1 + i, s2 + i); + if (res) + return (res); + + while ((i < maxlen) && s1[i] && s2[i]) + ++i; + + if (i >= maxlen) + break; + + if (s1[i] == 0) { + if (s2[i] == 0) { + ++i; + continue; + } else + return (-1); + } else if (s2[i] == 0) + return (+1); + else + /* NOTREACHED */ + err(2, "bwscoll error 2"); + } + + if (len1 < len2) + return (-1); + else if (len1 > len2) + return (+1); + + return (0); + } + } + } +} + +/* + * Correction of the system API + */ +double +bwstod(struct bwstring *s0, bool *empty) +{ + double ret = 0; + + if (MB_CUR_MAX == 1) { + unsigned char *end, *s; + char *ep; + + s = s0->data.cstr; + end = s + s0->len; + ep = NULL; + + while (isblank(*s) && s < end) + ++s; + + if (!isprint(*s)) { + *empty = true; + return (0); + } + + ret = strtod(s, &ep); + if ((unsigned char*) ep == s) { + *empty = true; + return (0); + } + } else { + wchar_t *end, *ep, *s; + + s = s0->data.wstr; + end = s + s0->len; + ep = NULL; + + while (iswblank(*s) && s < end) + ++s; + + if (!iswprint(*s)) { + *empty = true; + return (0); + } + + ret = wcstod(s, &ep); + if (ep == s) { + *empty = true; + return (0); + } + } + + *empty = false; + return (ret); +} + +/* + * A helper function for monthcoll. If a line matches + * a month name, it returns (number of the month - 1), + * while if there is no match, it just return -1. + */ + +int +bws_month_score(const struct bwstring *s0) +{ + + if (MB_CUR_MAX == 1) { + const unsigned char *end, *s; + size_t len; + + s = s0->data.cstr; + end = s + s0->len; + + while (isblank(*s) && s < end) + ++s; + + len = strlen(s); + + for (int i = 11; i >= 0; --i) { + if (cmonths[i] && + (s == (unsigned char*)strstr(s, cmonths[i]))) + return (i); + } + + } else { + const wchar_t *end, *s; + size_t len; + + s = s0->data.wstr; + end = s + s0->len; + + while (iswblank(*s) && s < end) + ++s; + + len = wcslen(s); + + for (int i = 11; i >= 0; --i) { + if (wmonths[i] && (s == wcsstr(s, wmonths[i]))) + return (i); + } + } + + return (-1); +} + +/* + * Rips out leading blanks (-b). + */ +struct bwstring * +ignore_leading_blanks(struct bwstring *str) +{ + + if (MB_CUR_MAX == 1) { + unsigned char *dst, *end, *src; + + src = str->data.cstr; + dst = src; + end = src + str->len; + + while (src < end && isblank(*src)) + ++src; + + if (src != dst) { + size_t newlen; + + newlen = BWSLEN(str) - (src - dst); + + while (src < end) { + *dst = *src; + ++dst; + ++src; + } + bws_setlen(str, newlen); + } + } else { + wchar_t *dst, *end, *src; + + src = str->data.wstr; + dst = src; + end = src + str->len; + + while (src < end && iswblank(*src)) + ++src; + + if (src != dst) { + + size_t newlen = BWSLEN(str) - (src - dst); + + while (src < end) { + *dst = *src; + ++dst; + ++src; + } + bws_setlen(str, newlen); + + } + } + return (str); +} + +/* + * Rips out nonprinting characters (-i). + */ +struct bwstring * +ignore_nonprinting(struct bwstring *str) +{ + size_t newlen = str->len; + + if (MB_CUR_MAX == 1) { + unsigned char *dst, *end, *src; + unsigned char c; + + src = str->data.cstr; + dst = src; + end = src + str->len; + + while (src < end) { + c = *src; + if (isprint(c)) { + *dst = c; + ++dst; + ++src; + } else { + ++src; + --newlen; + } + } + } else { + wchar_t *dst, *end, *src; + wchar_t c; + + src = str->data.wstr; + dst = src; + end = src + str->len; + + while (src < end) { + c = *src; + if (iswprint(c)) { + *dst = c; + ++dst; + ++src; + } else { + ++src; + --newlen; + } + } + } + bws_setlen(str, newlen); + + return (str); +} + +/* + * Rips out any characters that are not alphanumeric characters + * nor blanks (-d). + */ +struct bwstring * +dictionary_order(struct bwstring *str) +{ + size_t newlen = str->len; + + if (MB_CUR_MAX == 1) { + unsigned char *dst, *end, *src; + unsigned char c; + + src = str->data.cstr; + dst = src; + end = src + str->len; + + while (src < end) { + c = *src; + if (isalnum(c) || isblank(c)) { + *dst = c; + ++dst; + ++src; + } else { + ++src; + --newlen; + } + } + } else { + wchar_t *dst, *end, *src; + wchar_t c; + + src = str->data.wstr; + dst = src; + end = src + str->len; + + while (src < end) { + c = *src; + if (iswalnum(c) || iswblank(c)) { + *dst = c; + ++dst; + ++src; + } else { + ++src; + --newlen; + } + } + } + bws_setlen(str, newlen); + + return (str); +} + +/* + * Converts string to lower case(-f). + */ +struct bwstring * +ignore_case(struct bwstring *str) +{ + + if (MB_CUR_MAX == 1) { + unsigned char *end, *s; + + s = str->data.cstr; + end = s + str->len; + + while (s < end) { + *s = toupper(*s); + ++s; + } + } else { + wchar_t *end, *s; + + s = str->data.wstr; + end = s + str->len; + + while (s < end) { + *s = towupper(*s); + ++s; + } + } + return (str); +} + +void +bws_disorder_warnx(struct bwstring *s, const char *fn, size_t pos) +{ + + if (MB_CUR_MAX == 1) + warnx("%s:%zu: disorder: %s", fn, pos + 1, s->data.cstr); + else + warnx("%s:%zu: disorder: %ls", fn, pos + 1, s->data.wstr); +} diff --git a/usr.bin/sort/bwstring.h b/usr.bin/sort/bwstring.h new file mode 100644 index 000000000000..828704f5de41 --- /dev/null +++ b/usr.bin/sort/bwstring.h @@ -0,0 +1,142 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (C) 2009 Gabor Kovesdan + * Copyright (C) 2012 Oleg Moskalenko + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if !defined(__BWSTRING_H__) +#define __BWSTRING_H__ + +#include +#include +#include +#include +#include + +#include "mem.h" + +extern bool byte_sort; + +/* wchar_t is of 4 bytes: */ +#define SIZEOF_WCHAR_STRING(LEN) ((LEN)*sizeof(wchar_t)) + +/* + * Binary "wide" string + */ +struct bwstring +{ + size_t len; + union + { + wchar_t wstr[0]; + unsigned char cstr[0]; + } data; +}; + +struct reader_buffer +{ + wchar_t *fgetwln_z_buffer; + size_t fgetwln_z_buffer_size; +}; + +typedef void *bwstring_iterator; + +#define BWSLEN(s) ((s)->len) + +struct bwstring *bwsalloc(size_t sz); + +size_t bwsrawlen(const struct bwstring *bws); +const void* bwsrawdata(const struct bwstring *bws); +void bws_setlen(struct bwstring *bws, size_t newlen); +size_t bws_memsize(const struct bwstring *bws); +double bwstod(struct bwstring *s0, bool *empty); +int bws_month_score(const struct bwstring *s0); + +struct bwstring *ignore_leading_blanks(struct bwstring *str); +struct bwstring *ignore_nonprinting(struct bwstring *str); +struct bwstring *dictionary_order(struct bwstring *str); +struct bwstring *ignore_case(struct bwstring *str); + +void bwsprintf(FILE*, struct bwstring*, const char *prefix, const char *suffix); +void bws_disorder_warnx(struct bwstring *s, const char *fn, size_t pos); + +struct bwstring *bwsdup(const struct bwstring *s); +struct bwstring *bwssbdup(const wchar_t *str, size_t size); +struct bwstring *bwscsbdup(const unsigned char *str, size_t size); +void bwsfree(const struct bwstring *s); +size_t bwscpy(struct bwstring *dst, const struct bwstring *src); +struct bwstring *bwsncpy(struct bwstring *dst, const struct bwstring *src, size_t size); +struct bwstring *bwsnocpy(struct bwstring *dst, const struct bwstring *src, size_t offset, size_t size); +int bwscmp(const struct bwstring *bws1, const struct bwstring *bws2, size_t offset); +int bwsncmp(const struct bwstring *bws1, const struct bwstring *bws2, size_t offset, size_t len); +int bwscoll(const struct bwstring *bws1, const struct bwstring *bws2, size_t offset); +int bwsfwrite(struct bwstring *bws, FILE *f, bool zero_ended); +struct bwstring *bwsfgetln(FILE *file, size_t *len, bool zero_ended, struct reader_buffer *rb); + +static inline bwstring_iterator +bws_begin(struct bwstring *bws) +{ + + return (bwstring_iterator) (&(bws->data)); +} + +static inline bwstring_iterator +bws_end(struct bwstring *bws) +{ + + return ((MB_CUR_MAX == 1) ? + (bwstring_iterator) (bws->data.cstr + bws->len) : + (bwstring_iterator) (bws->data.wstr + bws->len)); +} + +static inline bwstring_iterator +bws_iterator_inc(bwstring_iterator iter, size_t pos) +{ + + if (MB_CUR_MAX == 1) + return ((unsigned char *) iter) + pos; + else + return ((wchar_t*) iter) + pos; +} + +static inline wchar_t +bws_get_iter_value(bwstring_iterator iter) +{ + + if (MB_CUR_MAX == 1) + return *((unsigned char *) iter); + else + return *((wchar_t*) iter); +} + +int +bws_iterator_cmp(bwstring_iterator iter1, bwstring_iterator iter2, size_t len); + +#define BWS_GET(bws, pos) ((MB_CUR_MAX == 1) ? ((bws)->data.cstr[(pos)]) : (bws)->data.wstr[(pos)]) + +void initialise_months(void); + +#endif /* __BWSTRING_H__ */ diff --git a/usr.bin/sort/coll.c b/usr.bin/sort/coll.c new file mode 100644 index 000000000000..0a341e42270a --- /dev/null +++ b/usr.bin/sort/coll.c @@ -0,0 +1,1307 @@ +/*- + * Copyright (C) 2009 Gabor Kovesdan + * Copyright (C) 2012 Oleg Moskalenko + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "coll.h" +#include "vsort.h" + +struct key_specs *keys = NULL; +size_t keys_num = 0; + +wchar_t symbol_decimal_point = L'.'; +/* there is no default thousands separator in collate rules: */ +wchar_t symbol_thousands_sep = 0; +wchar_t symbol_negative_sign = L'-'; +wchar_t symbol_positive_sign = L'+'; + +static int wstrcoll(struct key_value *kv1, struct key_value *kv2, size_t offset); +static int gnumcoll(struct key_value*, struct key_value *, size_t offset); +static int monthcoll(struct key_value*, struct key_value *, size_t offset); +static int numcoll(struct key_value*, struct key_value *, size_t offset); +static int hnumcoll(struct key_value*, struct key_value *, size_t offset); +static int randomcoll(struct key_value*, struct key_value *, size_t offset); +static int versioncoll(struct key_value*, struct key_value *, size_t offset); + +/* + * Allocate keys array + */ +struct keys_array * +keys_array_alloc(void) +{ + struct keys_array *ka; + size_t sz; + + sz = keys_array_size(); + ka = sort_malloc(sz); + memset(ka, 0, sz); + + return (ka); +} + +/* + * Calculate whether we need key hint space + */ +static size_t +key_hint_size(void) +{ + + return (need_hint ? sizeof(struct key_hint) : 0); +} + +/* + * Calculate keys array size + */ +size_t +keys_array_size(void) +{ + + return (keys_num * (sizeof(struct key_value) + key_hint_size())); +} + +/* + * Clean data of keys array + */ +void +clean_keys_array(const struct bwstring *s, struct keys_array *ka) +{ + + if (ka) { + for (size_t i = 0; i < keys_num; ++i) + if (ka->key[i].k && ka->key[i].k != s) + bwsfree(ka->key[i].k); + memset(ka, 0, keys_array_size()); + } +} + +/* + * Set value of a key in the keys set + */ +void +set_key_on_keys_array(struct keys_array *ka, struct bwstring *s, size_t ind) +{ + + if (ka && keys_num > ind) { + struct key_value *kv; + + kv = &(ka->key[ind]); + + if (kv->k && kv->k != s) + bwsfree(kv->k); + kv->k = s; + } +} + +/* + * Initialize a sort list item + */ +struct sort_list_item * +sort_list_item_alloc(void) +{ + struct sort_list_item *si; + size_t sz; + + sz = sizeof(struct sort_list_item) + keys_array_size(); + si = sort_malloc(sz); + memset(si, 0, sz); + + return (si); +} + +size_t +sort_list_item_size(struct sort_list_item *si) +{ + size_t ret = 0; + + if (si) { + ret = sizeof(struct sort_list_item) + keys_array_size(); + if (si->str) + ret += bws_memsize(si->str); + for (size_t i = 0; i < keys_num; ++i) { + struct key_value *kv; + + kv = &(si->ka.key[i]); + + if (kv->k != si->str) + ret += bws_memsize(kv->k); + } + } + return (ret); +} + +/* + * Calculate key for a sort list item + */ +static void +sort_list_item_make_key(struct sort_list_item *si) +{ + + preproc(si->str, &(si->ka)); +} + +/* + * Set value of a sort list item. + * Return combined string and keys memory size. + */ +void +sort_list_item_set(struct sort_list_item *si, struct bwstring *str) +{ + + if (si) { + clean_keys_array(si->str, &(si->ka)); + if (si->str) { + if (si->str == str) { + /* we are trying to reset the same string */ + return; + } else { + bwsfree(si->str); + si->str = NULL; + } + } + si->str = str; + sort_list_item_make_key(si); + } +} + +/* + * De-allocate a sort list item object memory + */ +void +sort_list_item_clean(struct sort_list_item *si) +{ + + if (si) { + clean_keys_array(si->str, &(si->ka)); + if (si->str) { + bwsfree(si->str); + si->str = NULL; + } + } +} + +/* + * Skip columns according to specs + */ +static size_t +skip_cols_to_start(const struct bwstring *s, size_t cols, size_t start, + bool skip_blanks, bool *empty_key) +{ + if (cols < 1) + return (BWSLEN(s) + 1); + + if (skip_blanks) + while (start < BWSLEN(s) && iswblank(BWS_GET(s,start))) + ++start; + + while (start < BWSLEN(s) && cols > 1) { + --cols; + ++start; + } + + if (start >= BWSLEN(s)) + *empty_key = true; + + return (start); +} + +/* + * Skip fields according to specs + */ +static size_t +skip_fields_to_start(const struct bwstring *s, size_t fields, bool *empty_field) +{ + + if (fields < 2) { + if (BWSLEN(s) == 0) + *empty_field = true; + return (0); + } else if (!(sort_opts_vals.tflag)) { + size_t cpos = 0; + bool pb = true; + + while (cpos < BWSLEN(s)) { + bool isblank; + + isblank = iswblank(BWS_GET(s,cpos)); + + if (isblank && !pb) { + --fields; + if (fields <= 1) + return (cpos); + } + pb = isblank; + ++cpos; + } + if (fields > 1) + *empty_field = true; + return (cpos); + } else { + size_t cpos = 0; + + while (cpos < BWSLEN(s)) { + if (BWS_GET(s,cpos) == sort_opts_vals.field_sep) { + --fields; + if (fields <= 1) + return (cpos + 1); + } + ++cpos; + } + if (fields > 1) + *empty_field = true; + return (cpos); + } +} + +/* + * Find fields start + */ +static void +find_field_start(const struct bwstring *s, struct key_specs *ks, + size_t *field_start, size_t *key_start, bool *empty_field, bool *empty_key) +{ + + *field_start = skip_fields_to_start(s, ks->f1, empty_field); + if (!*empty_field) + *key_start = skip_cols_to_start(s, ks->c1, *field_start, + ks->pos1b, empty_key); + else + *empty_key = true; +} + +/* + * Find end key position + */ +static size_t +find_field_end(const struct bwstring *s, struct key_specs *ks) +{ + size_t f2, next_field_start, pos_end; + bool empty_field, empty_key; + + pos_end = 0; + next_field_start = 0; + empty_field = false; + empty_key = false; + f2 = ks->f2; + + if (f2 == 0) + return (BWSLEN(s) + 1); + else { + if (ks->c2 == 0) { + next_field_start = skip_fields_to_start(s, f2 + 1, + &empty_field); + if ((next_field_start > 0) && sort_opts_vals.tflag && + (sort_opts_vals.field_sep == BWS_GET(s, + next_field_start - 1))) + --next_field_start; + } else + next_field_start = skip_fields_to_start(s, f2, + &empty_field); + } + + if (empty_field || (next_field_start >= BWSLEN(s))) + return (BWSLEN(s) + 1); + + if (ks->c2) { + pos_end = skip_cols_to_start(s, ks->c2, next_field_start, + ks->pos2b, &empty_key); + if (pos_end < BWSLEN(s)) + ++pos_end; + } else + pos_end = next_field_start; + + return (pos_end); +} + +/* + * Cut a field according to the key specs + */ +static struct bwstring * +cut_field(const struct bwstring *s, struct key_specs *ks) +{ + struct bwstring *ret = NULL; + + if (s && ks) { + size_t field_start, key_end, key_start, sz; + bool empty_field, empty_key; + + field_start = 0; + key_start = 0; + empty_field = false; + empty_key = false; + + find_field_start(s, ks, &field_start, &key_start, + &empty_field, &empty_key); + + if (empty_key) + sz = 0; + else { + key_end = find_field_end(s, ks); + sz = (key_end < key_start) ? 0 : (key_end - key_start); + } + + ret = bwsalloc(sz); + if (sz) + bwsnocpy(ret, s, key_start, sz); + } else + ret = bwsalloc(0); + + return (ret); +} + +/* + * Preprocesses a line applying the necessary transformations + * specified by command line options and returns the preprocessed + * string, which can be used to compare. + */ +int +preproc(struct bwstring *s, struct keys_array *ka) +{ + + if (sort_opts_vals.kflag) + for (size_t i = 0; i < keys_num; i++) { + struct bwstring *key; + struct key_specs *kspecs; + struct sort_mods *sm; + + kspecs = &(keys[i]); + key = cut_field(s, kspecs); + + sm = &(kspecs->sm); + if (sm->dflag) + key = dictionary_order(key); + else if (sm->iflag) + key = ignore_nonprinting(key); + if (sm->fflag || sm->Mflag) + key = ignore_case(key); + + set_key_on_keys_array(ka, key, i); + } + else { + struct bwstring *ret = NULL; + struct sort_mods *sm = default_sort_mods; + + if (sm->bflag) { + if (ret == NULL) + ret = bwsdup(s); + ret = ignore_leading_blanks(ret); + } + if (sm->dflag) { + if (ret == NULL) + ret = bwsdup(s); + ret = dictionary_order(ret); + } else if (sm->iflag) { + if (ret == NULL) + ret = bwsdup(s); + ret = ignore_nonprinting(ret); + } + if (sm->fflag || sm->Mflag) { + if (ret == NULL) + ret = bwsdup(s); + ret = ignore_case(ret); + } + if (ret == NULL) + set_key_on_keys_array(ka, s, 0); + else + set_key_on_keys_array(ka, ret, 0); + } + + return 0; +} + +cmpcoll_t +get_sort_func(struct sort_mods *sm) +{ + + if (sm->nflag) + return (numcoll); + else if (sm->hflag) + return (hnumcoll); + else if (sm->gflag) + return (gnumcoll); + else if (sm->Mflag) + return (monthcoll); + else if (sm->Rflag) + return (randomcoll); + else if (sm->Vflag) + return (versioncoll); + else + return (wstrcoll); +} + +/* + * Compares the given strings. Returns a positive number if + * the first precedes the second, a negative number if the second is + * the preceding one, and zero if they are equal. This function calls + * the underlying collate functions, which done the actual comparison. + */ +int +key_coll(struct keys_array *ps1, struct keys_array *ps2, size_t offset) +{ + struct sort_mods *sm; + int res = 0; + + for (size_t i = 0; i < keys_num; ++i) { + sm = &(keys[i].sm); + + if (sm->rflag) + res = sm->func(&(ps2->key[i]), &(ps1->key[i]), offset); + else + res = sm->func(&(ps1->key[i]), &(ps2->key[i]), offset); + + if (res) + break; + + /* offset applies to only the first key */ + offset = 0; + } + return (res); +} + +/* + * Compare two strings. + * Plain symbol-by-symbol comparison. + */ +int +top_level_str_coll(const struct bwstring *s1, const struct bwstring *s2) +{ + + if (default_sort_mods->rflag) { + const struct bwstring *tmp; + + tmp = s1; + s1 = s2; + s2 = tmp; + } + + return (bwscoll(s1, s2, 0)); +} + +/* + * Compare a string and a sort list item, according to the sort specs. + */ +int +str_list_coll(struct bwstring *str1, struct sort_list_item **ss2) +{ + struct keys_array *ka1; + int ret = 0; + + ka1 = keys_array_alloc(); + + preproc(str1, ka1); + + sort_list_item_make_key(*ss2); + + if (debug_sort) { + bwsprintf(stdout, str1, "; s1=<", ">"); + bwsprintf(stdout, (*ss2)->str, ", s2=<", ">"); + } + + ret = key_coll(ka1, &((*ss2)->ka), 0); + + if (debug_sort) + printf("; cmp1=%d", ret); + + clean_keys_array(str1, ka1); + sort_free(ka1); + + if ((ret == 0) && !(sort_opts_vals.sflag) && sort_opts_vals.complex_sort) { + ret = top_level_str_coll(str1, ((*ss2)->str)); + if (debug_sort) + printf("; cmp2=%d", ret); + } + + if (debug_sort) + printf("\n"); + + return (ret); +} + +/* + * Compare two sort list items, according to the sort specs. + */ +int +list_coll_offset(struct sort_list_item **ss1, struct sort_list_item **ss2, + size_t offset) +{ + int ret; + + ret = key_coll(&((*ss1)->ka), &((*ss2)->ka), offset); + + if (debug_sort) { + if (offset) + printf("; offset=%d", (int) offset); + bwsprintf(stdout, ((*ss1)->str), "; s1=<", ">"); + bwsprintf(stdout, ((*ss2)->str), ", s2=<", ">"); + printf("; cmp1=%d\n", ret); + } + + if (ret) + return (ret); + + if (!(sort_opts_vals.sflag) && sort_opts_vals.complex_sort) { + ret = top_level_str_coll(((*ss1)->str), ((*ss2)->str)); + if (debug_sort) + printf("; cmp2=%d\n", ret); + } + + return (ret); +} + +/* + * Compare two sort list items, according to the sort specs. + */ +int +list_coll(struct sort_list_item **ss1, struct sort_list_item **ss2) +{ + + return (list_coll_offset(ss1, ss2, 0)); +} + +#define LSCDEF(N) \ +static int \ +list_coll_##N(struct sort_list_item **ss1, struct sort_list_item **ss2) \ +{ \ + \ + return (list_coll_offset(ss1, ss2, N)); \ +} + +LSCDEF(1) +LSCDEF(2) +LSCDEF(3) +LSCDEF(4) +LSCDEF(5) +LSCDEF(6) +LSCDEF(7) +LSCDEF(8) +LSCDEF(9) +LSCDEF(10) +LSCDEF(11) +LSCDEF(12) +LSCDEF(13) +LSCDEF(14) +LSCDEF(15) +LSCDEF(16) +LSCDEF(17) +LSCDEF(18) +LSCDEF(19) +LSCDEF(20) + +listcoll_t +get_list_call_func(size_t offset) +{ + static const listcoll_t lsarray[] = { list_coll, list_coll_1, + list_coll_2, list_coll_3, list_coll_4, list_coll_5, + list_coll_6, list_coll_7, list_coll_8, list_coll_9, + list_coll_10, list_coll_11, list_coll_12, list_coll_13, + list_coll_14, list_coll_15, list_coll_16, list_coll_17, + list_coll_18, list_coll_19, list_coll_20 }; + + if (offset <= 20) + return (lsarray[offset]); + + return (list_coll); +} + +/* + * Compare two sort list items, only by their original string. + */ +int +list_coll_by_str_only(struct sort_list_item **ss1, struct sort_list_item **ss2) +{ + + return (top_level_str_coll(((*ss1)->str), ((*ss2)->str))); +} + +/* + * Maximum size of a number in the string (before or after decimal point) + */ +#define MAX_NUM_SIZE (128) + +/* + * Set suffix value + */ +static void setsuffix(wchar_t c, unsigned char *si) +{ + switch (c){ + case L'k': + case L'K': + *si = 1; + break; + case L'M': + *si = 2; + break; + case L'G': + *si = 3; + break; + case L'T': + *si = 4; + break; + case L'P': + *si = 5; + break; + case L'E': + *si = 6; + break; + case L'Z': + *si = 7; + break; + case L'Y': + *si = 8; + break; + default: + *si = 0; + }; +} + +/* + * Read string s and parse the string into a fixed-decimal-point number. + * sign equals -1 if the number is negative (explicit plus is not allowed, + * according to GNU sort's "info sort". + * The number part before decimal point is in the smain, after the decimal + * point is in sfrac, tail is the pointer to the remainder of the string. + */ +static int +read_number(struct bwstring *s0, int *sign, wchar_t *smain, int *main_len, wchar_t *sfrac, int *frac_len, unsigned char *si) +{ + bwstring_iterator s; + + s = bws_begin(s0); + + /* always end the fraction with zero, even if we have no fraction */ + sfrac[0] = 0; + + while (iswblank(bws_get_iter_value(s))) + s = bws_iterator_inc(s, 1); + + if (bws_get_iter_value(s) == symbol_negative_sign) { + *sign = -1; + s = bws_iterator_inc(s, 1); + } + + // This is '0', not '\0', do not change this + while (iswdigit(bws_get_iter_value(s)) && + (bws_get_iter_value(s) == L'0')) + s = bws_iterator_inc(s, 1); + + while (bws_get_iter_value(s) && *main_len < MAX_NUM_SIZE) { + if (iswdigit(bws_get_iter_value(s))) { + smain[*main_len] = bws_get_iter_value(s); + s = bws_iterator_inc(s, 1); + *main_len += 1; + } else if (symbol_thousands_sep && + (bws_get_iter_value(s) == symbol_thousands_sep)) + s = bws_iterator_inc(s, 1); + else + break; + } + + smain[*main_len] = 0; + + if (bws_get_iter_value(s) == symbol_decimal_point) { + s = bws_iterator_inc(s, 1); + while (iswdigit(bws_get_iter_value(s)) && + *frac_len < MAX_NUM_SIZE) { + sfrac[*frac_len] = bws_get_iter_value(s); + s = bws_iterator_inc(s, 1); + *frac_len += 1; + } + sfrac[*frac_len] = 0; + + while (*frac_len > 0 && sfrac[*frac_len - 1] == L'0') { + --(*frac_len); + sfrac[*frac_len] = L'\0'; + } + } + + setsuffix(bws_get_iter_value(s),si); + + if ((*main_len + *frac_len) == 0) + *sign = 0; + + return (0); +} + +/* + * Implements string sort. + */ +static int +wstrcoll(struct key_value *kv1, struct key_value *kv2, size_t offset) +{ + + if (debug_sort) { + if (offset) + printf("; offset=%d\n", (int) offset); + bwsprintf(stdout, kv1->k, "; k1=<", ">"); + printf("(%zu)", BWSLEN(kv1->k)); + bwsprintf(stdout, kv2->k, ", k2=<", ">"); + printf("(%zu)", BWSLEN(kv2->k)); + } + + return (bwscoll(kv1->k, kv2->k, offset)); +} + +/* + * Compare two suffixes + */ +static inline int +cmpsuffix(unsigned char si1, unsigned char si2) +{ + + return ((char)si1 - (char)si2); +} + +/* + * Implements numeric sort for -n and -h. + */ +static int +numcoll_impl(struct key_value *kv1, struct key_value *kv2, size_t offset, bool use_suffix) +{ + struct bwstring *s1, *s2; + wchar_t sfrac1[MAX_NUM_SIZE + 1], sfrac2[MAX_NUM_SIZE + 1]; + wchar_t smain1[MAX_NUM_SIZE + 1], smain2[MAX_NUM_SIZE + 1]; + int cmp_res, frac1, frac2, main1, main2, sign1, sign2; + unsigned char SI1, SI2; + bool e1, e2, key1_read, key2_read; + + s1 = kv1->k; + s2 = kv2->k; + sign1 = sign2 = 0; + main1 = main2 = 0; + frac1 = frac2 = 0; + + cmp_res = 0; + key1_read = key2_read = false; + + UNUSED_ARG(offset); + + if (debug_sort) { + bwsprintf(stdout, s1, "; k1=<", ">"); + bwsprintf(stdout, s2, ", k2=<", ">"); + } + + if (s1 == s2) + return (0); + + if (kv1->hint->status == HS_UNINITIALIZED) { + /* read the number from the string */ + read_number(s1, &sign1, smain1, &main1, sfrac1, &frac1, &SI1); + key1_read = true; + kv1->hint->v.nh.n1 = wcstoull(smain1, NULL, 10); + if(main1 < 1 && frac1 < 1) + kv1->hint->v.nh.empty=true; + kv1->hint->v.nh.si = SI1; + kv1->hint->status = (kv1->hint->v.nh.n1 != ULLONG_MAX) ? + HS_INITIALIZED : HS_ERROR; + kv1->hint->v.nh.neg = (sign1 < 0) ? true : false; + } + + if (kv2->hint->status == HS_UNINITIALIZED) { + /* read the number from the string */ + read_number(s2, &sign2, smain2, &main2, sfrac2, &frac2,&SI2); + key2_read = true; + kv2->hint->v.nh.n1 = wcstoull(smain2, NULL, 10); + if(main2 < 1 && frac2 < 1) + kv2->hint->v.nh.empty=true; + kv2->hint->v.nh.si = SI2; + kv2->hint->status = (kv2->hint->v.nh.n1 != ULLONG_MAX) ? + HS_INITIALIZED : HS_ERROR; + kv2->hint->v.nh.neg = (sign2 < 0) ? true : false; + } + + if (kv1->hint->status == HS_INITIALIZED && kv2->hint->status == + HS_INITIALIZED) { + unsigned long long n1, n2; + bool neg1, neg2; + + e1 = kv1->hint->v.nh.empty; + e2 = kv2->hint->v.nh.empty; + + if (e1 && e2) + return (0); + + neg1 = kv1->hint->v.nh.neg; + neg2 = kv2->hint->v.nh.neg; + + if (neg1 && !neg2) + return (-1); + if (neg2 && !neg1) + return (+1); + + if (e1) + return (neg2 ? +1 : -1); + else if (e2) + return (neg1 ? -1 : +1); + + + if (use_suffix) { + cmp_res = cmpsuffix(kv1->hint->v.nh.si, kv2->hint->v.nh.si); + if (cmp_res) + return (neg1 ? -cmp_res : cmp_res); + } + + n1 = kv1->hint->v.nh.n1; + n2 = kv2->hint->v.nh.n1; + if (n1 < n2) + return (neg1 ? +1 : -1); + else if (n1 > n2) + return (neg1 ? -1 : +1); + } + + /* read the numbers from the strings */ + if (!key1_read) + read_number(s1, &sign1, smain1, &main1, sfrac1, &frac1, &SI1); + if (!key2_read) + read_number(s2, &sign2, smain2, &main2, sfrac2, &frac2, &SI2); + + e1 = ((main1 + frac1) == 0); + e2 = ((main2 + frac2) == 0); + + if (e1 && e2) + return (0); + + /* we know the result if the signs are different */ + if (sign1 < 0 && sign2 >= 0) + return (-1); + if (sign1 >= 0 && sign2 < 0) + return (+1); + + if (e1) + return ((sign2 < 0) ? +1 : -1); + else if (e2) + return ((sign1 < 0) ? -1 : +1); + + if (use_suffix) { + cmp_res = cmpsuffix(SI1, SI2); + if (cmp_res) + return ((sign1 < 0) ? -cmp_res : cmp_res); + } + + /* if both numbers are empty assume that the strings are equal */ + if (main1 < 1 && main2 < 1 && frac1 < 1 && frac2 < 1) + return (0); + + /* + * if the main part is of different size, we know the result + * (because the leading zeros are removed) + */ + if (main1 < main2) + cmp_res = -1; + else if (main1 > main2) + cmp_res = +1; + /* if the sizes are equal then simple non-collate string compare gives the correct result */ + else + cmp_res = wcscmp(smain1, smain2); + + /* check fraction */ + if (!cmp_res) + cmp_res = wcscmp(sfrac1, sfrac2); + + if (!cmp_res) + return (0); + + /* reverse result if the signs are negative */ + if (sign1 < 0 && sign2 < 0) + cmp_res = -cmp_res; + + return (cmp_res); +} + +/* + * Implements numeric sort (-n). + */ +static int +numcoll(struct key_value *kv1, struct key_value *kv2, size_t offset) +{ + + return (numcoll_impl(kv1, kv2, offset, false)); +} + +/* + * Implements 'human' numeric sort (-h). + */ +static int +hnumcoll(struct key_value *kv1, struct key_value *kv2, size_t offset) +{ + + return (numcoll_impl(kv1, kv2, offset, true)); +} + +/* + * Implements random sort (-R). + */ +static int +randomcoll(struct key_value *kv1, struct key_value *kv2, size_t offset) +{ + struct bwstring *s1, *s2; + MD5_CTX ctx1, ctx2; + char *b1, *b2; + + UNUSED_ARG(offset); + + s1 = kv1->k; + s2 = kv2->k; + + if (debug_sort) { + bwsprintf(stdout, s1, "; k1=<", ">"); + bwsprintf(stdout, s2, ", k2=<", ">"); + } + + if (s1 == s2) + return (0); + + memcpy(&ctx1,&md5_ctx,sizeof(MD5_CTX)); + memcpy(&ctx2,&md5_ctx,sizeof(MD5_CTX)); + + MD5Update(&ctx1, bwsrawdata(s1), bwsrawlen(s1)); + MD5Update(&ctx2, bwsrawdata(s2), bwsrawlen(s2)); + b1 = MD5End(&ctx1, NULL); + b2 = MD5End(&ctx2, NULL); + if (b1 == NULL) { + if (b2 == NULL) + return (0); + else { + sort_free(b2); + return (-1); + } + } else if (b2 == NULL) { + sort_free(b1); + return (+1); + } else { + int cmp_res; + + cmp_res = strcmp(b1,b2); + sort_free(b1); + sort_free(b2); + + if (!cmp_res) + cmp_res = bwscoll(s1, s2, 0); + + return (cmp_res); + } +} + +/* + * Implements version sort (-V). + */ +static int +versioncoll(struct key_value *kv1, struct key_value *kv2, size_t offset) +{ + struct bwstring *s1, *s2; + + UNUSED_ARG(offset); + + s1 = kv1->k; + s2 = kv2->k; + + if (debug_sort) { + bwsprintf(stdout, s1, "; k1=<", ">"); + bwsprintf(stdout, s2, ", k2=<", ">"); + } + + if (s1 == s2) + return (0); + + return (vcmp(s1, s2)); +} + +/* + * Check for minus infinity + */ +static inline bool +huge_minus(double d, int err1) +{ + + if (err1 == ERANGE) + if (d == -HUGE_VAL || d == -HUGE_VALF || d == -HUGE_VALL) + return (+1); + + return (0); +} + +/* + * Check for plus infinity + */ +static inline bool +huge_plus(double d, int err1) +{ + + if (err1 == ERANGE) + if (d == HUGE_VAL || d == HUGE_VALF || d == HUGE_VALL) + return (+1); + + return (0); +} + +/* + * Check whether a function is a NAN + */ +static bool +is_nan(double d) +{ + + return ((d == NAN) || (isnan(d))); +} + +/* + * Compare two NANs + */ +static int +cmp_nans(double d1, double d2) +{ + + if (d1 < d2) + return (-1); + if (d2 > d2) + return (+1); + return (0); +} + +/* + * Implements general numeric sort (-g). + */ +static int +gnumcoll(struct key_value *kv1, struct key_value *kv2, size_t offset) +{ + double d1, d2; + int err1, err2; + bool empty1, empty2, key1_read, key2_read; + + d1 = d2 = 0; + err1 = err2 = 0; + key1_read = key2_read = false; + + UNUSED_ARG(offset); + + if (debug_sort) { + bwsprintf(stdout, kv1->k, "; k1=<", ">"); + bwsprintf(stdout, kv2->k, "; k2=<", ">"); + } + + if (kv1->hint->status == HS_UNINITIALIZED) { + errno = 0; + d1 = bwstod(kv1->k, &empty1); + err1 = errno; + + if (empty1) + kv1->hint->v.gh.notnum = true; + else if (err1 == 0) { + kv1->hint->v.gh.d = d1; + kv1->hint->v.gh.nan = is_nan(d1); + kv1->hint->status = HS_INITIALIZED; + } else + kv1->hint->status = HS_ERROR; + + key1_read = true; + } + + if (kv2->hint->status == HS_UNINITIALIZED) { + errno = 0; + d2 = bwstod(kv2->k, &empty2); + err2 = errno; + + if (empty2) + kv2->hint->v.gh.notnum = true; + else if (err2 == 0) { + kv2->hint->v.gh.d = d2; + kv2->hint->v.gh.nan = is_nan(d2); + kv2->hint->status = HS_INITIALIZED; + } else + kv2->hint->status = HS_ERROR; + + key2_read = true; + } + + if (kv1->hint->status == HS_INITIALIZED && + kv2->hint->status == HS_INITIALIZED) { + if (kv1->hint->v.gh.notnum) + return ((kv2->hint->v.gh.notnum) ? 0 : -1); + else if (kv2->hint->v.gh.notnum) + return (+1); + + if (kv1->hint->v.gh.nan) + return ((kv2->hint->v.gh.nan) ? + cmp_nans(kv1->hint->v.gh.d, kv2->hint->v.gh.d) : + -1); + else if (kv2->hint->v.gh.nan) + return (+1); + + d1 = kv1->hint->v.gh.d; + d2 = kv2->hint->v.gh.d; + + if (d1 < d2) + return (-1); + else if (d1 > d2) + return (+1); + else + return (0); + } + + if (!key1_read) { + errno = 0; + d1 = bwstod(kv1->k, &empty1); + err1 = errno; + } + + if (!key2_read) { + errno = 0; + d2 = bwstod(kv2->k, &empty2); + err2 = errno; + } + + /* Non-value case: */ + if (empty1) + return (empty2 ? 0 : -1); + else if (empty2) + return (+1); + + /* NAN case */ + if (is_nan(d1)) + return (is_nan(d2) ? cmp_nans(d1, d2) : -1); + else if (is_nan(d2)) + return (+1); + + /* Infinities */ + if (err1 == ERANGE || err2 == ERANGE) { + /* Minus infinity case */ + if (huge_minus(d1, err1)) { + if (huge_minus(d2, err2)) { + if (d1 < d2) + return (-1); + if (d1 > d2) + return (+1); + return (0); + } else + return (-1); + + } else if (huge_minus(d2, err2)) { + if (huge_minus(d1, err1)) { + if (d1 < d2) + return (-1); + if (d1 > d2) + return (+1); + return (0); + } else + return (+1); + } + + /* Plus infinity case */ + if (huge_plus(d1, err1)) { + if (huge_plus(d2, err2)) { + if (d1 < d2) + return (-1); + if (d1 > d2) + return (+1); + return (0); + } else + return (+1); + } else if (huge_plus(d2, err2)) { + if (huge_plus(d1, err1)) { + if (d1 < d2) + return (-1); + if (d1 > d2) + return (+1); + return (0); + } else + return (-1); + } + } + + if (d1 < d2) + return (-1); + if (d1 > d2) + return (+1); + + return (0); +} + +/* + * Implements month sort (-M). + */ +static int +monthcoll(struct key_value *kv1, struct key_value *kv2, size_t offset) +{ + int val1, val2; + bool key1_read, key2_read; + + val1 = val2 = 0; + key1_read = key2_read = false; + + UNUSED_ARG(offset); + + if (debug_sort) { + bwsprintf(stdout, kv1->k, "; k1=<", ">"); + bwsprintf(stdout, kv2->k, "; k2=<", ">"); + } + + if (kv1->hint->status == HS_UNINITIALIZED) { + kv1->hint->v.Mh.m = bws_month_score(kv1->k); + key1_read = true; + kv1->hint->status = HS_INITIALIZED; + } + + if (kv2->hint->status == HS_UNINITIALIZED) { + kv2->hint->v.Mh.m = bws_month_score(kv2->k); + key2_read = true; + kv2->hint->status = HS_INITIALIZED; + } + + if (kv1->hint->status == HS_INITIALIZED) { + val1 = kv1->hint->v.Mh.m; + key1_read = true; + } + + if (kv2->hint->status == HS_INITIALIZED) { + val2 = kv2->hint->v.Mh.m; + key2_read = true; + } + + if (!key1_read) + val1 = bws_month_score(kv1->k); + if (!key2_read) + val2 = bws_month_score(kv2->k); + + if (val1 == val2) { + return (0); + } + if (val1 < val2) + return (-1); + return (+1); +} diff --git a/usr.bin/sort/coll.h b/usr.bin/sort/coll.h new file mode 100644 index 000000000000..c020e74a8244 --- /dev/null +++ b/usr.bin/sort/coll.h @@ -0,0 +1,167 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (C) 2009 Gabor Kovesdan + * Copyright (C) 2012 Oleg Moskalenko + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if !defined(__COLL_H__) +#define __COLL_H__ + +#include "bwstring.h" +#include "sort.h" + +/* + * Sort hint data for -n + */ +struct n_hint +{ + unsigned long long n1; + unsigned char si; + bool empty; + bool neg; +}; + +/* + * Sort hint data for -g + */ +struct g_hint +{ + double d; + bool nan; + bool notnum; +}; + +/* + * Sort hint data for -M + */ +struct M_hint +{ + int m; +}; + +/* + * Status of a sort hint object + */ +typedef enum +{ + HS_ERROR = -1, HS_UNINITIALIZED = 0, HS_INITIALIZED = 1 +} hint_status; + +/* + * Sort hint object + */ +struct key_hint +{ + hint_status status; + union + { + struct n_hint nh; + struct g_hint gh; + struct M_hint Mh; + } v; +}; + +/* + * Key value + */ +struct key_value +{ + struct bwstring *k; /* key string */ + struct key_hint hint[0]; /* key sort hint */ +}; + +/* + * Set of keys container object. + */ +struct keys_array +{ + struct key_value key[0]; +}; + +/* + * Parsed -k option data + */ +struct key_specs +{ + struct sort_mods sm; + size_t c1; + size_t c2; + size_t f1; + size_t f2; + bool pos1b; + bool pos2b; +}; + +/* + * Single entry in sort list. + */ +struct sort_list_item +{ + struct bwstring *str; + struct keys_array ka; +}; + +/* + * Function type, used to compare two list objects + */ +typedef int (*listcoll_t)(struct sort_list_item **ss1, struct sort_list_item **ss2); + +extern struct key_specs *keys; +extern size_t keys_num; + +/* + * Main localised symbols + */ +extern wchar_t symbol_decimal_point; +extern wchar_t symbol_thousands_sep; +extern wchar_t symbol_negative_sign; +extern wchar_t symbol_positive_sign; + +/* funcs */ + +cmpcoll_t get_sort_func(struct sort_mods *sm); + +struct keys_array *keys_array_alloc(void); +size_t keys_array_size(void); +void set_key_on_keys_array(struct keys_array *ka, struct bwstring *s, size_t ind); +void clean_keys_array(const struct bwstring *s, struct keys_array *ka); + +struct sort_list_item *sort_list_item_alloc(void); +void sort_list_item_set(struct sort_list_item *si, struct bwstring *str); +void sort_list_item_clean(struct sort_list_item *si); +size_t sort_list_item_size(struct sort_list_item *si); + +int preproc(struct bwstring *s, struct keys_array *ka); +int top_level_str_coll(const struct bwstring *, const struct bwstring *); +int key_coll(struct keys_array *ks1, struct keys_array *ks2, size_t offset); +int str_list_coll(struct bwstring *str1, struct sort_list_item **ss2); +int list_coll_by_str_only(struct sort_list_item **ss1, struct sort_list_item **ss2); +int list_coll(struct sort_list_item **ss1, struct sort_list_item **ss2); +int list_coll_offset(struct sort_list_item **ss1, struct sort_list_item **ss2, size_t offset); + +listcoll_t get_list_call_func(size_t offset); + +#endif /* __COLL_H__ */ diff --git a/usr.bin/sort/file.c b/usr.bin/sort/file.c new file mode 100644 index 000000000000..6eb332a7fb34 --- /dev/null +++ b/usr.bin/sort/file.c @@ -0,0 +1,1615 @@ +/*- + * Copyright (C) 2009 Gabor Kovesdan + * Copyright (C) 2012 Oleg Moskalenko + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#if defined(SORT_THREADS) +#include +#endif +#include +#include +#include +#include +#include +#include +#include + +#include "coll.h" +#include "file.h" +#include "radixsort.h" + +unsigned long long free_memory = 1000000; +unsigned long long available_free_memory = 1000000; + +const char *tmpdir = "/var/tmp"; +const char* compress_program = NULL; + +size_t max_open_files = 16; + +/* + * How much space we read from file at once + */ +#define READ_CHUNK (4096) + +/* + * File reader structure + */ +struct file_reader +{ + struct reader_buffer rb; + FILE *file; + char *fname; + unsigned char *buffer; + unsigned char *mmapaddr; + unsigned char *mmapptr; + size_t bsz; + size_t cbsz; + size_t mmapsize; + size_t strbeg; + int fd; + char elsymb; +}; + +/* + * Structure to be used in file merge process. + */ +struct file_header +{ + struct file_reader *fr; + struct sort_list_item *si; /* current top line */ + size_t file_pos; +}; + +/* + * List elements of "cleanable" files list. + */ +struct CLEANABLE_FILE +{ + char *fn; + LIST_ENTRY(CLEANABLE_FILE) files; +}; + +/* + * List header of "cleanable" files list. + */ +static LIST_HEAD(CLEANABLE_FILES,CLEANABLE_FILE) tmp_files; + +/* + * Semaphore to protect the tmp file list. + * We use semaphore here because it is signal-safe, according to POSIX. + * And semaphore does not require pthread library. + */ +static sem_t tmp_files_sem; + +static void mt_sort(struct sort_list *list, + int (*sort_func)(void *, size_t, size_t, + int (*)(const void *, const void *)), const char* fn); + +/* + * Init tmp files list + */ +void +init_tmp_files(void) +{ + + LIST_INIT(&tmp_files); + sem_init(&tmp_files_sem, 0, 1); +} + +/* + * Save name of a tmp file for signal cleanup + */ +void +tmp_file_atexit(const char *tmp_file) +{ + + if (tmp_file) { + sem_wait(&tmp_files_sem); + struct CLEANABLE_FILE *item = + sort_malloc(sizeof(struct CLEANABLE_FILE)); + item->fn = sort_strdup(tmp_file); + LIST_INSERT_HEAD(&tmp_files, item, files); + sem_post(&tmp_files_sem); + } +} + +/* + * Clear tmp files + */ +void +clear_tmp_files(void) +{ + struct CLEANABLE_FILE *item; + + sem_wait(&tmp_files_sem); + LIST_FOREACH(item,&tmp_files,files) { + if ((item) && (item->fn)) + unlink(item->fn); + } + sem_post(&tmp_files_sem); +} + +/* + * Check whether a file is a temporary file + */ +static bool +file_is_tmp(const char* fn) +{ + struct CLEANABLE_FILE *item; + bool ret = false; + + if (fn) { + sem_wait(&tmp_files_sem); + LIST_FOREACH(item,&tmp_files,files) { + if ((item) && (item->fn)) + if (strcmp(item->fn, fn) == 0) { + ret = true; + break; + } + } + sem_post(&tmp_files_sem); + } + + return (ret); +} + +/* + * Read zero-terminated line from a file + */ +char * +read_file0_line(struct file0_reader *f0r) +{ + size_t pos = 0; + int c; + + if ((f0r->f == NULL) || feof(f0r->f)) + return (NULL); + + if (f0r->current_line && f0r->current_sz > 0) + f0r->current_line[0] = 0; + + while (!feof(f0r->f)) { + c = fgetc(f0r->f); + if (feof(f0r->f) || (c == -1)) + break; + if ((pos + 1) >= f0r->current_sz) { + size_t newsz = (f0r->current_sz + 2) * 2; + f0r->current_line = sort_realloc(f0r->current_line, + newsz); + f0r->current_sz = newsz; + } + f0r->current_line[pos] = (char)c; + if (c == 0) + break; + else + f0r->current_line[pos + 1] = 0; + ++pos; + } + + return f0r->current_line; +} + +/* + * Generate new temporary file name + */ +char * +new_tmp_file_name(void) +{ + static unsigned int tfcounter = 0; + static const char *fn = ".bsdsort."; + char *ret; + size_t sz; + + sz = strlen(tmpdir) + 1 + strlen(fn) + 32 + 1; + ret = sort_malloc(sz); + + sprintf(ret, "%s/%s%d.%u", tmpdir, fn, (int) getpid(), tfcounter++); + tmp_file_atexit(ret); + return (ret); +} + +/* + * Initialize file list + */ +void +file_list_init(struct file_list *fl, bool tmp) +{ + + if (fl) { + fl->count = 0; + fl->sz = 0; + fl->fns = NULL; + fl->tmp = tmp; + } +} + +/* + * Add a file name to the list + */ +void +file_list_add(struct file_list *fl, char *fn, bool allocate) +{ + + if (fl && fn) { + if (fl->count >= fl->sz || (fl->fns == NULL)) { + fl->sz = (fl->sz) * 2 + 1; + fl->fns = sort_realloc(fl->fns, fl->sz * + sizeof(char *)); + } + fl->fns[fl->count] = allocate ? sort_strdup(fn) : fn; + fl->count += 1; + } +} + +/* + * Populate file list from array of file names + */ +void +file_list_populate(struct file_list *fl, int argc, char **argv, bool allocate) +{ + + if (fl && argv) { + int i; + + for (i = 0; i < argc; i++) + file_list_add(fl, argv[i], allocate); + } +} + +/* + * Clean file list data and delete the files, + * if this is a list of temporary files + */ +void +file_list_clean(struct file_list *fl) +{ + + if (fl) { + if (fl->fns) { + int i; + + for (i = 0; i < fl->count; i++) { + if (fl->fns[i]) { + if (fl->tmp) + unlink(fl->fns[i]); + sort_free(fl->fns[i]); + fl->fns[i] = 0; + } + } + sort_free(fl->fns); + fl->fns = NULL; + } + fl->sz = 0; + fl->count = 0; + fl->tmp = false; + } +} + +/* + * Init sort list + */ +void +sort_list_init(struct sort_list *l) +{ + + if (l) { + l->count = 0; + l->size = 0; + l->memsize = sizeof(struct sort_list); + l->list = NULL; + } +} + +/* + * Add string to sort list + */ +void +sort_list_add(struct sort_list *l, struct bwstring *str) +{ + + if (l && str) { + size_t indx = l->count; + + if ((l->list == NULL) || (indx >= l->size)) { + int newsize = (l->size + 1) + 1024; + + l->list = sort_realloc(l->list, + sizeof(struct sort_list_item*) * newsize); + l->memsize += (newsize - l->size) * + sizeof(struct sort_list_item*); + l->size = newsize; + } + l->list[indx] = sort_list_item_alloc(); + sort_list_item_set(l->list[indx], str); + l->memsize += sort_list_item_size(l->list[indx]); + l->count += 1; + } +} + +/* + * Clean sort list data + */ +void +sort_list_clean(struct sort_list *l) +{ + + if (l) { + if (l->list) { + size_t i; + + for (i = 0; i < l->count; i++) { + struct sort_list_item *item; + + item = l->list[i]; + + if (item) { + sort_list_item_clean(item); + sort_free(item); + l->list[i] = NULL; + } + } + sort_free(l->list); + l->list = NULL; + } + l->count = 0; + l->size = 0; + l->memsize = sizeof(struct sort_list); + } +} + +/* + * Write sort list to file + */ +void +sort_list_dump(struct sort_list *l, const char *fn) +{ + + if (l && fn) { + FILE *f; + + f = openfile(fn, "w"); + if (f == NULL) + err(2, NULL); + + if (l->list) { + struct sort_list_item *last_printed_item; + size_t i; + + last_printed_item = NULL; + + for (i = 0; i < l->count; i++) { + struct sort_list_item *item; + + item = l->list[i]; + + if (!(sort_opts_vals.uflag) || + (last_printed_item == NULL) || + list_coll(&last_printed_item, &item)) { + bwsfwrite(item->str, f, + sort_opts_vals.zflag); + if (sort_opts_vals.uflag) + last_printed_item = item; + } + } + } + + closefile(f, fn); + } +} + +/* + * Checks if the given file is sorted. Stops at the first disorder, + * prints the disordered line and returns 1. + */ +int +check(const char *fn) +{ + struct bwstring *s1, *s2, *s1disorder, *s2disorder; + struct file_reader *fr; + struct keys_array *ka1, *ka2; + int res, pos, posdisorder; + + s1 = s2 = s1disorder = s2disorder = NULL; + ka1 = ka2 = NULL; + + fr = file_reader_init(fn); + + res = 0; + pos = 1; + posdisorder = 1; + + if (fr == NULL) { + err(2, NULL); + goto end; + } + + s1 = file_reader_readline(fr); + if (s1 == NULL) + goto end; + + ka1 = keys_array_alloc(); + preproc(s1, ka1); + + s2 = file_reader_readline(fr); + if (s2 == NULL) + goto end; + + ka2 = keys_array_alloc(); + preproc(s2, ka2); + + for (;;) { + + if (debug_sort) { + bwsprintf(stdout, s2, "s1=<", ">"); + bwsprintf(stdout, s1, "s2=<", ">"); + } + int cmp = key_coll(ka2, ka1, 0); + if (debug_sort) + printf("; cmp1=%d", cmp); + + if (!cmp && sort_opts_vals.complex_sort && + !(sort_opts_vals.uflag) && !(sort_opts_vals.sflag)) { + cmp = top_level_str_coll(s2, s1); + if (debug_sort) + printf("; cmp2=%d", cmp); + } + if (debug_sort) + printf("\n"); + + if ((sort_opts_vals.uflag && (cmp <= 0)) || (cmp < 0)) { + if (!(sort_opts_vals.csilentflag)) { + s2disorder = bwsdup(s2); + posdisorder = pos; + if (debug_sort) + s1disorder = bwsdup(s1); + } + res = 1; + goto end; + } + + pos++; + + clean_keys_array(s1, ka1); + sort_free(ka1); + ka1 = ka2; + ka2 = NULL; + + bwsfree(s1); + s1 = s2; + + s2 = file_reader_readline(fr); + if (s2 == NULL) + goto end; + + ka2 = keys_array_alloc(); + preproc(s2, ka2); + } + +end: + if (ka1) { + clean_keys_array(s1, ka1); + sort_free(ka1); + } + + if (s1) + bwsfree(s1); + + if (ka2) { + clean_keys_array(s2, ka2); + sort_free(ka2); + } + + if (s2) + bwsfree(s2); + + if ((fn == NULL) || (*fn == 0) || (strcmp(fn, "-") == 0)) { + for (;;) { + s2 = file_reader_readline(fr); + if (s2 == NULL) + break; + bwsfree(s2); + } + } + + file_reader_free(fr); + + if (s2disorder) { + bws_disorder_warnx(s2disorder, fn, posdisorder); + if (s1disorder) { + bws_disorder_warnx(s1disorder, fn, posdisorder); + if (s1disorder != s2disorder) + bwsfree(s1disorder); + } + bwsfree(s2disorder); + s1disorder = NULL; + s2disorder = NULL; + } + + if (res) + exit(res); + + return (0); +} + +/* + * Opens a file. If the given filename is "-", stdout will be + * opened. + */ +FILE * +openfile(const char *fn, const char *mode) +{ + FILE *file; + + if (strcmp(fn, "-") == 0) { + return ((mode && mode[0] == 'r') ? stdin : stdout); + } else { + mode_t orig_file_mask = 0; + int is_tmp = file_is_tmp(fn); + + if (is_tmp && (mode[0] == 'w')) + orig_file_mask = umask(S_IWGRP | S_IWOTH | + S_IRGRP | S_IROTH); + + if (is_tmp && (compress_program != NULL)) { + char *cmd; + size_t cmdsz; + + cmdsz = strlen(fn) + 128; + cmd = sort_malloc(cmdsz); + + fflush(stdout); + + if (mode[0] == 'r') + snprintf(cmd, cmdsz - 1, "cat %s | %s -d", + fn, compress_program); + else if (mode[0] == 'w') + snprintf(cmd, cmdsz - 1, "%s > %s", + compress_program, fn); + else + err(2, getstr(7)); + + if ((file = popen(cmd, mode)) == NULL) + err(2, NULL); + + sort_free(cmd); + + } else + if ((file = fopen(fn, mode)) == NULL) + err(2, NULL); + + if (is_tmp && (mode[0] == 'w')) + umask(orig_file_mask); + } + + return (file); +} + +/* + * Close file + */ +void +closefile(FILE *f, const char *fn) +{ + if (f == NULL) { + ; + } else if (f == stdin) { + ; + } else if (f == stdout) { + fflush(f); + } else { + if (file_is_tmp(fn) && compress_program != NULL) { + if(pclose(f)<0) + err(2,NULL); + } else + fclose(f); + } +} + +/* + * Reads a file into the internal buffer. + */ +struct file_reader * +file_reader_init(const char *fsrc) +{ + struct file_reader *ret; + + if (fsrc == NULL) + fsrc = "-"; + + ret = sort_malloc(sizeof(struct file_reader)); + memset(ret, 0, sizeof(struct file_reader)); + + ret->elsymb = '\n'; + if (sort_opts_vals.zflag) + ret->elsymb = 0; + + ret->fname = sort_strdup(fsrc); + + if (strcmp(fsrc, "-") && (compress_program == NULL)) { + + do { + struct stat stat_buf; + void *addr; + size_t sz = 0; + int fd, flags; + + flags = MAP_NOCORE | MAP_NOSYNC; + addr = MAP_FAILED; + + fd = open(fsrc, O_RDONLY); + if (fd < 0) + err(2, NULL); + + if (fstat(fd, &stat_buf) < 0) { + close(fd); + break; + } + + sz = stat_buf.st_size; + +#if defined(MAP_PREFAULT_READ) + flags |= MAP_PREFAULT_READ; +#endif + + addr = mmap(NULL, sz, PROT_READ, flags, fd, 0); + if (addr == MAP_FAILED) { + close(fd); + break; + } + + ret->fd = fd; + ret->mmapaddr = addr; + ret->mmapsize = sz; + ret->mmapptr = ret->mmapaddr; + + } while (0); + } + + if (ret->mmapaddr == NULL) { + ret->file = openfile(fsrc, "r"); + if (ret->file == NULL) + err(2, NULL); + + if (strcmp(fsrc, "-")) { + ret->cbsz = READ_CHUNK; + ret->buffer = sort_malloc(ret->cbsz); + ret->bsz = 0; + ret->strbeg = 0; + + ret->bsz = fread(ret->buffer, 1, ret->cbsz, ret->file); + if (ret->bsz == 0) { + if (ferror(ret->file)) + err(2, NULL); + } + } + } + + return (ret); +} + +struct bwstring * +file_reader_readline(struct file_reader *fr) +{ + struct bwstring *ret = NULL; + + if (fr->mmapaddr) { + unsigned char *mmapend; + + mmapend = fr->mmapaddr + fr->mmapsize; + if (fr->mmapptr >= mmapend) + return (NULL); + else { + unsigned char *strend; + size_t sz; + + sz = mmapend - fr->mmapptr; + strend = memchr(fr->mmapptr, fr->elsymb, sz); + + if (strend == NULL) { + ret = bwscsbdup(fr->mmapptr, sz); + fr->mmapptr = mmapend; + } else { + ret = bwscsbdup(fr->mmapptr, strend - + fr->mmapptr); + fr->mmapptr = strend + 1; + } + } + + } else if (fr->file != stdin) { + unsigned char *strend; + size_t bsz1, remsz, search_start; + + search_start = 0; + remsz = 0; + strend = NULL; + + if (fr->bsz > fr->strbeg) + remsz = fr->bsz - fr->strbeg; + + /* line read cycle */ + for (;;) { + if (remsz > search_start) + strend = memchr(fr->buffer + fr->strbeg + + search_start, fr->elsymb, remsz - + search_start); + else + strend = NULL; + + if (strend) + break; + if (feof(fr->file)) + break; + + if (fr->bsz != fr->cbsz) + /* NOTREACHED */ + err(2, "File read software error 1"); + + if (remsz > (READ_CHUNK >> 1)) { + search_start = fr->cbsz - fr->strbeg; + fr->cbsz += READ_CHUNK; + fr->buffer = sort_realloc(fr->buffer, + fr->cbsz); + bsz1 = fread(fr->buffer + fr->bsz, 1, + READ_CHUNK, fr->file); + if (bsz1 == 0) { + if (ferror(fr->file)) + err(2, NULL); + break; + } + fr->bsz += bsz1; + remsz += bsz1; + } else { + if (remsz > 0 && fr->strbeg>0) + bcopy(fr->buffer + fr->strbeg, + fr->buffer, remsz); + + fr->strbeg = 0; + search_start = remsz; + bsz1 = fread(fr->buffer + remsz, 1, + fr->cbsz - remsz, fr->file); + if (bsz1 == 0) { + if (ferror(fr->file)) + err(2, NULL); + break; + } + fr->bsz = remsz + bsz1; + remsz = fr->bsz; + } + } + + if (strend == NULL) + strend = fr->buffer + fr->bsz; + + if ((fr->buffer + fr->strbeg <= strend) && + (fr->strbeg < fr->bsz) && (remsz>0)) + ret = bwscsbdup(fr->buffer + fr->strbeg, strend - + fr->buffer - fr->strbeg); + + fr->strbeg = (strend - fr->buffer) + 1; + + } else { + size_t len = 0; + + ret = bwsfgetln(fr->file, &len, sort_opts_vals.zflag, + &(fr->rb)); + } + + return (ret); +} + +static void +file_reader_clean(struct file_reader *fr) +{ + + if (fr) { + if (fr->mmapaddr) + munmap(fr->mmapaddr, fr->mmapsize); + + if (fr->fd) + close(fr->fd); + + if (fr->buffer) + sort_free(fr->buffer); + + if (fr->file) + if (fr->file != stdin) + closefile(fr->file, fr->fname); + + if(fr->fname) + sort_free(fr->fname); + + memset(fr, 0, sizeof(struct file_reader)); + } +} + +void +file_reader_free(struct file_reader *fr) +{ + + if (fr) { + file_reader_clean(fr); + sort_free(fr); + } +} + +int +procfile(const char *fsrc, struct sort_list *list, struct file_list *fl) +{ + struct file_reader *fr; + + fr = file_reader_init(fsrc); + if (fr == NULL) + err(2, NULL); + + /* file browse cycle */ + for (;;) { + struct bwstring *bws; + + bws = file_reader_readline(fr); + + if (bws == NULL) + break; + + sort_list_add(list, bws); + + if (list->memsize >= available_free_memory) { + char *fn; + + fn = new_tmp_file_name(); + sort_list_to_file(list, fn); + file_list_add(fl, fn, false); + sort_list_clean(list); + } + } + + file_reader_free(fr); + + return (0); +} + +/* + * Compare file headers. Files with EOF always go to the end of the list. + */ +static int +file_header_cmp(struct file_header *f1, struct file_header *f2) +{ + + if (f1 == f2) + return (0); + else { + if (f1->fr == NULL) { + return ((f2->fr == NULL) ? 0 : +1); + } else if (f2->fr == NULL) + return (-1); + else { + int ret; + + ret = list_coll(&(f1->si), &(f2->si)); + if (!ret) + return ((f1->file_pos < f2->file_pos) ? -1 : +1); + return (ret); + } + } +} + +/* + * Allocate and init file header structure + */ +static void +file_header_init(struct file_header **fh, const char *fn, size_t file_pos) +{ + + if (fh && fn) { + struct bwstring *line; + + *fh = sort_malloc(sizeof(struct file_header)); + (*fh)->file_pos = file_pos; + (*fh)->fr = file_reader_init(fn); + if ((*fh)->fr == NULL) { + perror(fn); + err(2, getstr(8)); + } + line = file_reader_readline((*fh)->fr); + if (line == NULL) { + file_reader_free((*fh)->fr); + (*fh)->fr = NULL; + (*fh)->si = NULL; + } else { + (*fh)->si = sort_list_item_alloc(); + sort_list_item_set((*fh)->si, line); + } + } +} + +/* + * Close file + */ +static void +file_header_close(struct file_header **fh) +{ + + if (fh && *fh) { + if ((*fh)->fr) { + file_reader_free((*fh)->fr); + (*fh)->fr = NULL; + } + if ((*fh)->si) { + sort_list_item_clean((*fh)->si); + sort_free((*fh)->si); + (*fh)->si = NULL; + } + sort_free(*fh); + *fh = NULL; + } +} + +/* + * Swap two array elements + */ +static void +file_header_swap(struct file_header **fh, int i1, int i2) +{ + struct file_header *tmp; + + tmp = fh[i1]; + fh[i1] = fh[i2]; + fh[i2] = tmp; +} + +/* heap algorithm ==>> */ + +/* + * See heap sort algorithm + * "Raises" last element to its right place + */ +static void +file_header_heap_swim(struct file_header **fh, int indx) +{ + + if (indx > 0) { + int parent_index; + + parent_index = (indx - 1) >> 1; + + if (file_header_cmp(fh[indx], fh[parent_index]) < 0) { + /* swap child and parent and continue */ + file_header_swap(fh, indx, parent_index); + file_header_heap_swim(fh, parent_index); + } + } +} + +/* + * Sink the top element to its correct position + */ +static void +file_header_heap_sink(struct file_header **fh, int indx, int size) +{ + int left_child_index; + int right_child_index; + + left_child_index = indx + indx + 1; + right_child_index = left_child_index + 1; + + if (left_child_index < size) { + int min_child_index; + + min_child_index = left_child_index; + + if ((right_child_index < size) && + (file_header_cmp(fh[left_child_index], + fh[right_child_index]) > 0)) + min_child_index = right_child_index; + if (file_header_cmp(fh[indx], fh[min_child_index]) > 0) { + file_header_swap(fh, indx, min_child_index); + file_header_heap_sink(fh, min_child_index, size); + } + } +} + +/* <<== heap algorithm */ + +/* + * Adds element to the "left" end + */ +static void +file_header_list_rearrange_from_header(struct file_header **fh, int size) +{ + + file_header_heap_sink(fh, 0, size); +} + +/* + * Adds element to the "right" end + */ +static void +file_header_list_push(struct file_header *f, struct file_header **fh, int size) +{ + + fh[size++] = f; + file_header_heap_swim(fh, size - 1); +} + +struct last_printed +{ + struct bwstring *str; +}; + +/* + * Prints the current line of the file + */ +static void +file_header_print(struct file_header *fh, FILE *f_out, struct last_printed *lp) +{ + + if (fh && fh->fr && f_out && fh->si && fh->si->str) { + if (sort_opts_vals.uflag) { + if ((lp->str == NULL) || (str_list_coll(lp->str, &(fh->si)))) { + bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag); + if (lp->str) + bwsfree(lp->str); + lp->str = bwsdup(fh->si->str); + } + } else + bwsfwrite(fh->si->str, f_out, sort_opts_vals.zflag); + } +} + +/* + * Read next line + */ +static void +file_header_read_next(struct file_header *fh) +{ + + if (fh && fh->fr) { + struct bwstring *tmp; + + tmp = file_reader_readline(fh->fr); + if (tmp == NULL) { + file_reader_free(fh->fr); + fh->fr = NULL; + if (fh->si) { + sort_list_item_clean(fh->si); + sort_free(fh->si); + fh->si = NULL; + } + } else { + if (fh->si == NULL) + fh->si = sort_list_item_alloc(); + sort_list_item_set(fh->si, tmp); + } + } +} + +/* + * Merge array of "files headers" + */ +static void +file_headers_merge(int fnum, struct file_header **fh, FILE *f_out) +{ + struct last_printed lp; + int i; + + memset(&lp, 0, sizeof(lp)); + + /* + * construct the initial sort structure + */ + for (i = 0; i < fnum; i++) + file_header_list_push(fh[i], fh, i); + + while (fh[0]->fr) { /* unfinished files are always in front */ + /* output the smallest line: */ + file_header_print(fh[0], f_out, &lp); + /* read a new line, if possible: */ + file_header_read_next(fh[0]); + /* re-arrange the list: */ + file_header_list_rearrange_from_header(fh, fnum); + } + + if (lp.str) + bwsfree(lp.str); +} + +/* + * Merges the given files into the output file, which can be + * stdout. + */ +static void +merge_files_array(int argc, char **argv, const char *fn_out) +{ + + if (argv && fn_out) { + struct file_header **fh; + FILE *f_out; + int i; + + f_out = openfile(fn_out, "w"); + + if (f_out == NULL) + err(2, NULL); + + fh = sort_malloc((argc + 1) * sizeof(struct file_header *)); + + for (i = 0; i < argc; i++) + file_header_init(fh + i, argv[i], (size_t) i); + + file_headers_merge(argc, fh, f_out); + + for (i = 0; i < argc; i++) + file_header_close(fh + i); + + sort_free(fh); + + closefile(f_out, fn_out); + } +} + +/* + * Shrinks the file list until its size smaller than max number of opened files + */ +static int +shrink_file_list(struct file_list *fl) +{ + + if ((fl == NULL) || (size_t) (fl->count) < max_open_files) + return (0); + else { + struct file_list new_fl; + int indx = 0; + + file_list_init(&new_fl, true); + while (indx < fl->count) { + char *fnew; + int num; + + num = fl->count - indx; + fnew = new_tmp_file_name(); + + if ((size_t) num >= max_open_files) + num = max_open_files - 1; + merge_files_array(num, fl->fns + indx, fnew); + if (fl->tmp) { + int i; + + for (i = 0; i < num; i++) + unlink(fl->fns[indx + i]); + } + file_list_add(&new_fl, fnew, false); + indx += num; + } + fl->tmp = false; /* already taken care of */ + file_list_clean(fl); + + fl->count = new_fl.count; + fl->fns = new_fl.fns; + fl->sz = new_fl.sz; + fl->tmp = new_fl.tmp; + + return (1); + } +} + +/* + * Merge list of files + */ +void +merge_files(struct file_list *fl, const char *fn_out) +{ + + if (fl && fn_out) { + while (shrink_file_list(fl)); + + merge_files_array(fl->count, fl->fns, fn_out); + } +} + +static const char * +get_sort_method_name(int sm) +{ + + if (sm == SORT_MERGESORT) + return "mergesort"; + else if (sort_opts_vals.sort_method == SORT_RADIXSORT) + return "radixsort"; + else if (sort_opts_vals.sort_method == SORT_HEAPSORT) + return "heapsort"; + else + return "quicksort"; +} + +/* + * Wrapper for qsort + */ +static int sort_qsort(void *list, size_t count, size_t elem_size, + int (*cmp_func)(const void *, const void *)) +{ + + qsort(list, count, elem_size, cmp_func); + return (0); +} + +/* + * Sort list of lines and writes it to the file + */ +void +sort_list_to_file(struct sort_list *list, const char *outfile) +{ + struct sort_mods *sm = &(keys[0].sm); + + if (!(sm->Mflag) && !(sm->Rflag) && !(sm->Vflag) && !(sm->Vflag) && + !(sm->gflag) && !(sm->hflag) && !(sm->nflag)) { + if ((sort_opts_vals.sort_method == SORT_DEFAULT) && byte_sort) + sort_opts_vals.sort_method = SORT_RADIXSORT; + + } else if (sort_opts_vals.sort_method == SORT_RADIXSORT) + err(2, getstr(9)); + + /* + * to handle stable sort and the unique cases in the + * right order, we need stable basic algorithm + */ + if (sort_opts_vals.sflag) { + switch (sort_opts_vals.sort_method){ + case SORT_MERGESORT: + break; + case SORT_RADIXSORT: + break; + case SORT_DEFAULT: + sort_opts_vals.sort_method = SORT_MERGESORT; + break; + default: + errx(2, getstr(10)); + }; + } + + if (sort_opts_vals.sort_method == SORT_DEFAULT) + sort_opts_vals.sort_method = SORT_QSORT; + + if (debug_sort) + printf("sort_method=%s\n", + get_sort_method_name(sort_opts_vals.sort_method)); + + switch (sort_opts_vals.sort_method){ + case SORT_RADIXSORT: + rxsort(list->list, list->count); + sort_list_dump(list, outfile); + break; + case SORT_MERGESORT: + mt_sort(list, mergesort, outfile); + break; + case SORT_HEAPSORT: + mt_sort(list, heapsort, outfile); + break; + default: + mt_sort(list, sort_qsort, outfile); + break; + } +} + +/******************* MT SORT ************************/ + +#if defined(SORT_THREADS) +/* semaphore to count threads */ +static sem_t mtsem; + +/* current system sort function */ +static int (*g_sort_func)(void *, size_t, size_t, + int(*)(const void *, const void *)); + +/* + * Sort cycle thread (in multi-threaded mode) + */ +static void* +mt_sort_thread(void* arg) +{ + struct sort_list *list = arg; + + g_sort_func(list->list, list->count, sizeof(struct sort_list_item *), + (int(*)(const void *, const void *)) list_coll); + + sem_post(&mtsem); + + return (arg); +} + +/* + * Compare sub-lists. Empty sub-lists always go to the end of the list. + */ +static int +sub_list_cmp(struct sort_list *l1, struct sort_list *l2) +{ + + if (l1 == l2) + return (0); + else { + if (l1->count == 0) { + return ((l2->count == 0) ? 0 : +1); + } else if (l2->count == 0) { + return (-1); + } else { + int ret; + + ret = list_coll(&(l1->list[0]), &(l2->list[0])); + if (!ret) + return ((l1->sub_list_pos < l2->sub_list_pos) ? + -1 : +1); + return (ret); + } + } +} + +/* + * Swap two array elements + */ +static void +sub_list_swap(struct sort_list **sl, int i1, int i2) +{ + struct sort_list *tmp; + + tmp = sl[i1]; + sl[i1] = sl[i2]; + sl[i2] = tmp; +} + +/* heap algorithm ==>> */ + +/* + * See heap sort algorithm + * "Raises" last element to its right place + */ +static void +sub_list_swim(struct sort_list **sl, int indx) +{ + + if (indx > 0) { + int parent_index; + + parent_index = (indx - 1) >> 1; + + if (sub_list_cmp(sl[indx], sl[parent_index]) < 0) { + /* swap child and parent and continue */ + sub_list_swap(sl, indx, parent_index); + sub_list_swim(sl, parent_index); + } + } +} + +/* + * Sink the top element to its correct position + */ +static void +sub_list_sink(struct sort_list **sl, int indx, int size) +{ + int left_child_index; + int right_child_index; + + left_child_index = indx + indx + 1; + right_child_index = left_child_index + 1; + + if (left_child_index < size) { + int min_child_index; + + min_child_index = left_child_index; + + if ((right_child_index < size) && + (sub_list_cmp(sl[left_child_index], + sl[right_child_index]) > 0)) + min_child_index = right_child_index; + if (sub_list_cmp(sl[indx], sl[min_child_index]) > 0) { + sub_list_swap(sl, indx, min_child_index); + sub_list_sink(sl, min_child_index, size); + } + } +} + +/* <<== heap algorithm */ + +/* + * Adds element to the "right" end + */ +static void +sub_list_push(struct sort_list *s, struct sort_list **sl, int size) +{ + + sl[size++] = s; + sub_list_swim(sl, size - 1); +} + +struct last_printed_item +{ + struct sort_list_item *item; +}; + +/* + * Prints the current line of the file + */ +static void +sub_list_header_print(struct sort_list *sl, FILE *f_out, + struct last_printed_item *lp) +{ + + if (sl && sl->count && f_out && sl->list[0]->str) { + if (sort_opts_vals.uflag) { + if ((lp->item == NULL) || (list_coll(&(lp->item), + &(sl->list[0])))) { + bwsfwrite(sl->list[0]->str, f_out, + sort_opts_vals.zflag); + lp->item = sl->list[0]; + } + } else + bwsfwrite(sl->list[0]->str, f_out, + sort_opts_vals.zflag); + } +} + +/* + * Read next line + */ +static void +sub_list_next(struct sort_list *sl) +{ + + if (sl && sl->count) { + sl->list += 1; + sl->count -= 1; + } +} + +/* + * Merge sub-lists to a file + */ +static void +merge_sub_lists(struct sort_list **sl, size_t n, FILE* f_out) +{ + struct last_printed_item lp; + size_t i; + + memset(&lp,0,sizeof(lp)); + + /* construct the initial list: */ + for (i = 0; i < n; i++) + sub_list_push(sl[i], sl, i); + + while (sl[0]->count) { /* unfinished lists are always in front */ + /* output the smallest line: */ + sub_list_header_print(sl[0], f_out, &lp); + /* move to a new line, if possible: */ + sub_list_next(sl[0]); + /* re-arrange the list: */ + sub_list_sink(sl, 0, n); + } +} + +/* + * Merge sub-lists to a file + */ +static void +merge_list_parts(struct sort_list **parts, size_t n, const char *fn) +{ + FILE* f_out; + + f_out = openfile(fn,"w"); + + merge_sub_lists(parts, n, f_out); + + closefile(f_out, fn); +} + +#endif /* defined(SORT_THREADS) */ +/* + * Multi-threaded sort algorithm "driver" + */ +static void +mt_sort(struct sort_list *list, + int(*sort_func)(void *, size_t, size_t, int(*)(const void *, const void *)), + const char* fn) +{ +#if defined(SORT_THREADS) + if (nthreads < 2 || list->count < nthreads) { +#endif + /* if single thread or small data, do simple sort */ + sort_func(list->list, list->count, + sizeof(struct sort_list_item *), + (int(*)(const void *, const void *)) list_coll); + sort_list_dump(list, fn); +#if defined(SORT_THREADS) + } else { + /* multi-threaded sort */ + struct sort_list **parts; + size_t avgsize, cstart, i; + + /* array of sub-lists */ + parts = sort_malloc(sizeof(struct sort_list*) * nthreads); + cstart = 0; + avgsize = list->count / nthreads; + + /* set global system sort function */ + g_sort_func = sort_func; + + /* set sublists */ + for (i = 0; i < nthreads; ++i) { + size_t sz = 0; + + parts[i] = sort_malloc(sizeof(struct sort_list)); + parts[i]->list = list->list + cstart; + parts[i]->memsize = 0; + parts[i]->sub_list_pos = i; + + sz = (i == nthreads - 1) ? list->count - cstart : + avgsize; + + parts[i]->count = sz; + + parts[i]->size = parts[i]->count; + + cstart += sz; + } + + /* init threads counting semaphore */ + sem_init(&mtsem, 0, 0); + + /* start threads */ + for (i = 0; i < nthreads; ++i) { + pthread_t pth; + pthread_attr_t attr; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_DETACHED); + + pthread_create(&pth, &attr, mt_sort_thread, parts[i]); + + pthread_attr_destroy(&attr); + } + + /* wait for threads completion */ + for (i = 0; i < nthreads; ++i) { + sem_wait(&mtsem); + } + /* destroy the semaphore - we do not need it anymore */ + sem_destroy(&mtsem); + + /* merge sorted sub-lists to the file */ + merge_list_parts(parts, nthreads, fn); + + /* free sub-lists data */ + for (i = 0; i < nthreads; ++i) { + sort_free(parts[i]); + } + sort_free(parts); + } +#endif /* defined(SORT_THREADS) */ +} diff --git a/usr.bin/sort/file.h b/usr.bin/sort/file.h new file mode 100644 index 000000000000..fd2392221e5b --- /dev/null +++ b/usr.bin/sort/file.h @@ -0,0 +1,132 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (C) 2009 Gabor Kovesdan + * Copyright (C) 2012 Oleg Moskalenko + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if !defined(__SORT_FILE_H__) +#define __SORT_FILE_H__ + +#include "coll.h" +#include "sort.h" + +#define SORT_DEFAULT 0 +#define SORT_QSORT 1 +#define SORT_MERGESORT 2 +#define SORT_HEAPSORT 3 +#define SORT_RADIXSORT 4 + +/* + * List of data to be sorted. + */ +struct sort_list +{ + struct sort_list_item **list; + unsigned long long memsize; + size_t count; + size_t size; + size_t sub_list_pos; +}; + +/* + * File reader object + */ +struct file_reader; + +/* + * List of files to be sorted + */ +struct file_list +{ + char **fns; + int count; + int sz; + bool tmp; +}; + +/* + * Structure for zero-separated file reading (for input files list) + */ +struct file0_reader +{ + char *current_line; + FILE *f; + size_t current_sz; +}; + +/* memory */ + +/**/ + +extern unsigned long long free_memory; +extern unsigned long long available_free_memory; + +/* temporary file dir */ + +extern const char *tmpdir; + +/* + * Max number of simultaneously open files (including the output file). + */ +extern size_t max_open_files; + +/* + * Compress program + */ +extern const char* compress_program; + +/* funcs */ + +struct file_reader *file_reader_init(const char *fsrc); +struct bwstring *file_reader_readline(struct file_reader *fr); +void file_reader_free(struct file_reader *fr); + +char *read_file0_line(struct file0_reader *f0r); + +void init_tmp_files(void); +void clear_tmp_files(void); +char *new_tmp_file_name(void); +void tmp_file_atexit(const char *tmp_file); + +void file_list_init(struct file_list *fl, bool tmp); +void file_list_add(struct file_list *fl, char *fn, bool allocate); +void file_list_populate(struct file_list *fl, int argc, char **argv, bool allocate); +void file_list_clean(struct file_list *fl); + +int check(const char *); +void merge_files(struct file_list *fl, const char *fn_out); +FILE *openfile(const char *, const char *); +void closefile(FILE *, const char *); +int procfile(const char *fn, struct sort_list *list, struct file_list *fl); + +void sort_list_init(struct sort_list *l); +void sort_list_add(struct sort_list *l, struct bwstring *str); +void sort_list_clean(struct sort_list *l); +void sort_list_dump(struct sort_list *l, const char *fn); + +void sort_list_to_file(struct sort_list *list, const char *outfile); + +#endif /* __SORT_FILE_H__ */ diff --git a/usr.bin/sort/mem.c b/usr.bin/sort/mem.c new file mode 100644 index 000000000000..999aea2b0060 --- /dev/null +++ b/usr.bin/sort/mem.c @@ -0,0 +1,82 @@ +/*- + * Copyright (C) 2009 Gabor Kovesdan + * Copyright (C) 2012 Oleg Moskalenko + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include "mem.h" + +/* + * malloc() wrapper. + */ +void * +sort_malloc(size_t size) +{ + void *ptr; + + if ((ptr = malloc(size)) == NULL) + err(2, NULL); + return (ptr); +} + +/* + * free() wrapper. + */ +void +sort_free(const void *ptr) +{ + + if (ptr) + free(__DECONST(void *, ptr)); +} + +/* + * realloc() wrapper. + */ +void * +sort_realloc(void *ptr, size_t size) +{ + + if ((ptr = realloc(ptr, size)) == NULL) + err(2, NULL); + return (ptr); +} + +char * +sort_strdup(const char *str) +{ + char *dup; + + if ((dup = strdup(str)) == NULL) + err(2, NULL); + return (dup); +} diff --git a/usr.bin/sort/mem.h b/usr.bin/sort/mem.h new file mode 100644 index 000000000000..13ba4a9aa7a6 --- /dev/null +++ b/usr.bin/sort/mem.h @@ -0,0 +1,45 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (C) 2009 Gabor Kovesdan + * Copyright (C) 2012 Oleg Moskalenko + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if !defined(__SORT_MEM_H__) +#define __SORT_MEM_H__ + +#include +#include +#include + +/* + * mem.c + */ +void *sort_malloc(size_t); +void sort_free(const void *ptr); +void *sort_realloc(void *, size_t); +char *sort_strdup(const char *); + +#endif /* __SORT_MEM_H__ */ diff --git a/usr.bin/sort/nls/C.msg b/usr.bin/sort/nls/C.msg new file mode 100644 index 000000000000..aa67f425c356 --- /dev/null +++ b/usr.bin/sort/nls/C.msg @@ -0,0 +1,10 @@ +$ $FreeBSD$ +$ +$set 1 +$quote " +1 "you cannot use -%c and -%c together" +2 "extra argument not allowed with -c" +3 "Unknown feature: %s\n" +4 "Wrong memory buffer specification" +5 "0 field in key specs\n" +6 "0 column in key specs\n" diff --git a/usr.bin/sort/nls/hu_HU.ISO8859-2.msg b/usr.bin/sort/nls/hu_HU.ISO8859-2.msg new file mode 100644 index 000000000000..cc338166fada --- /dev/null +++ b/usr.bin/sort/nls/hu_HU.ISO8859-2.msg @@ -0,0 +1,10 @@ +$ $FreeBSD$ +$ +$set 1 +$quote " +1 "a -%c és -%c opciók nem használhatók együtt" +2 "extra argumentum a -%c opcióval" +3 "Ismeretlen funkció: %s\n" +4 "Rossz memória puffer érték" +5 "0 mezõ a kulcsspecifikációban\n" +6 "0 oszlop a kulcsspecifikációban\n" diff --git a/usr.bin/sort/radixsort.c b/usr.bin/sort/radixsort.c new file mode 100644 index 000000000000..5d0dea750ec1 --- /dev/null +++ b/usr.bin/sort/radixsort.c @@ -0,0 +1,674 @@ +/*- + * Copyright (C) 2012 Oleg Moskalenko + * Copyright (C) 2012 Gabor Kovesdan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#if defined(SORT_THREADS) +#include +#include +#endif +#include +#include +#include +#include +#include + +#include "coll.h" +#include "radixsort.h" + +#define TINY_NODE(sl) ((sl)->tosort_num < 65) +#define SMALL_NODE(sl) ((sl)->tosort_num < 5) + +/* are we sorting in reverse order ? */ +static bool reverse_sort = false; + +/* sort sub-levels array size */ +static const size_t slsz = 256 * sizeof(struct sort_level*); + +/* one sort level structure */ +struct sort_level +{ + struct sort_level **sublevels; + struct sort_list_item **leaves; + struct sort_list_item **sorted; + struct sort_list_item **tosort; + size_t leaves_num; + size_t leaves_sz; + size_t level; + size_t real_sln; + size_t start_position; + size_t sln; + size_t tosort_num; + size_t tosort_sz; +}; + +/* stack of sort levels ready to be sorted */ +struct level_stack { + struct level_stack *next; + struct sort_level *sl; +}; + +static struct level_stack *g_ls = NULL; + +#if defined(SORT_THREADS) +/* stack guarding mutex */ +static pthread_mutex_t g_ls_mutex; + +/* counter: how many items are left */ +static size_t sort_left = 0; +/* guarding mutex */ +static pthread_mutex_t sort_left_mutex; + +/* semaphore to count threads */ +static sem_t mtsem; + +/* + * Decrement items counter + */ +static inline void +sort_left_dec(size_t n) +{ + + pthread_mutex_lock(&sort_left_mutex); + sort_left -= n; + pthread_mutex_unlock(&sort_left_mutex); +} + +/* + * Do we have something to sort ? + */ +static inline bool +have_sort_left(void) +{ + bool ret; + + pthread_mutex_lock(&sort_left_mutex); + ret = (sort_left > 0); + pthread_mutex_unlock(&sort_left_mutex); + return (ret); +} + +#else + +#define sort_left_dec(n) + +#endif /* SORT_THREADS */ + +/* + * Push sort level to the stack + */ +static inline void +push_ls(struct sort_level* sl) +{ + struct level_stack *new_ls; + + new_ls = sort_malloc(sizeof(struct level_stack)); + new_ls->sl = sl; + +#if defined(SORT_THREADS) + if (nthreads > 1) + pthread_mutex_lock(&g_ls_mutex); +#endif + + new_ls->next = g_ls; + g_ls = new_ls; + +#if defined(SORT_THREADS) + if (nthreads > 1) + pthread_mutex_unlock(&g_ls_mutex); +#endif +} + +/* + * Pop sort level from the stack (single-threaded style) + */ +static inline struct sort_level* +pop_ls_st(void) +{ + struct sort_level *sl; + + if (g_ls) { + struct level_stack *saved_ls; + + sl = g_ls->sl; + saved_ls = g_ls; + g_ls = g_ls->next; + sort_free(saved_ls); + } else + sl = NULL; + + return (sl); +} + +/* + * Pop sort level from the stack (multi-threaded style) + */ +static inline struct sort_level* +pop_ls_mt(void) +{ + struct level_stack *saved_ls; + struct sort_level *sl; + +#if defined(SORT_THREADS) + pthread_mutex_lock(&g_ls_mutex); +#endif + + if (g_ls) { + sl = g_ls->sl; + saved_ls = g_ls; + g_ls = g_ls->next; + } else { + sl = NULL; + saved_ls = NULL; + } + +#if defined(SORT_THREADS) + pthread_mutex_unlock(&g_ls_mutex); +#endif + + sort_free(saved_ls); + + return (sl); +} + +static void +add_to_sublevel(struct sort_level *sl, struct sort_list_item *item, int indx) +{ + struct sort_level *ssl; + + ssl = sl->sublevels[indx]; + + if (ssl == NULL) { + ssl = sort_malloc(sizeof(struct sort_level)); + memset(ssl, 0, sizeof(struct sort_level)); + + ssl->level = sl->level + 1; + sl->sublevels[indx] = ssl; + + ++(sl->real_sln); + } + + if (++(ssl->tosort_num) > ssl->tosort_sz) { + ssl->tosort_sz = ssl->tosort_num + 128; + ssl->tosort = sort_realloc(ssl->tosort, + sizeof(struct sort_list_item*) * (ssl->tosort_sz)); + } + + ssl->tosort[ssl->tosort_num - 1] = item; +} + +static inline void +add_leaf(struct sort_level *sl, struct sort_list_item *item) +{ + + if (++(sl->leaves_num) > sl->leaves_sz) { + sl->leaves_sz = sl->leaves_num + 128; + sl->leaves = sort_realloc(sl->leaves, + (sizeof(struct sort_list_item*) * (sl->leaves_sz))); + } + sl->leaves[sl->leaves_num - 1] = item; +} + +static inline int +get_wc_index(struct sort_list_item *sli, size_t level) +{ + const struct bwstring *bws; + + bws = sli->ka.key[0].k; + + if ((BWSLEN(bws) > level)) + return (unsigned char) BWS_GET(bws,level); + return (-1); +} + +static void +place_item(struct sort_level *sl, size_t item) +{ + struct sort_list_item *sli; + int c; + + sli = sl->tosort[item]; + c = get_wc_index(sli, sl->level); + + if (c == -1) + add_leaf(sl, sli); + else + add_to_sublevel(sl, sli, c); +} + +static void +free_sort_level(struct sort_level *sl) +{ + + if (sl) { + if (sl->leaves) + sort_free(sl->leaves); + + if (sl->level > 0) + sort_free(sl->tosort); + + if (sl->sublevels) { + struct sort_level *slc; + size_t sln; + + sln = sl->sln; + + for (size_t i = 0; i < sln; ++i) { + slc = sl->sublevels[i]; + if (slc) + free_sort_level(slc); + } + + sort_free(sl->sublevels); + } + + sort_free(sl); + } +} + +static void +run_sort_level_next(struct sort_level *sl) +{ + struct sort_level *slc; + size_t i, sln, tosort_num; + + if (sl->sublevels) { + sort_free(sl->sublevels); + sl->sublevels = NULL; + } + + switch (sl->tosort_num){ + case 0: + goto end; + case (1): + sl->sorted[sl->start_position] = sl->tosort[0]; + sort_left_dec(1); + goto end; + case (2): + if (list_coll_offset(&(sl->tosort[0]), &(sl->tosort[1]), + sl->level) > 0) { + sl->sorted[sl->start_position++] = sl->tosort[1]; + sl->sorted[sl->start_position] = sl->tosort[0]; + } else { + sl->sorted[sl->start_position++] = sl->tosort[0]; + sl->sorted[sl->start_position] = sl->tosort[1]; + } + sort_left_dec(2); + + goto end; + default: + if (TINY_NODE(sl) || (sl->level > 15)) { + listcoll_t func; + + func = get_list_call_func(sl->level); + + sl->leaves = sl->tosort; + sl->leaves_num = sl->tosort_num; + sl->leaves_sz = sl->leaves_num; + sl->leaves = sort_realloc(sl->leaves, + (sizeof(struct sort_list_item *) * + (sl->leaves_sz))); + sl->tosort = NULL; + sl->tosort_num = 0; + sl->tosort_sz = 0; + sl->sln = 0; + sl->real_sln = 0; + if (sort_opts_vals.sflag) { + if (mergesort(sl->leaves, sl->leaves_num, + sizeof(struct sort_list_item *), + (int(*)(const void *, const void *)) func) == -1) + /* NOTREACHED */ + err(2, "Radix sort error 3"); + } else + qsort(sl->leaves, sl->leaves_num, + sizeof(struct sort_list_item *), + (int(*)(const void *, const void *)) func); + + memcpy(sl->sorted + sl->start_position, + sl->leaves, sl->leaves_num * + sizeof(struct sort_list_item*)); + + sort_left_dec(sl->leaves_num); + + goto end; + } else { + sl->tosort_sz = sl->tosort_num; + sl->tosort = sort_realloc(sl->tosort, + sizeof(struct sort_list_item*) * (sl->tosort_sz)); + } + } + + sl->sln = 256; + sl->sublevels = sort_malloc(slsz); + memset(sl->sublevels, 0, slsz); + + sl->real_sln = 0; + + tosort_num = sl->tosort_num; + for (i = 0; i < tosort_num; ++i) + place_item(sl, i); + + sort_free(sl->tosort); + sl->tosort = NULL; + sl->tosort_num = 0; + sl->tosort_sz = 0; + + if (sl->leaves_num > 1) { + if (keys_num > 1) { + if (sort_opts_vals.sflag) { + mergesort(sl->leaves, sl->leaves_num, + sizeof(struct sort_list_item *), + (int(*)(const void *, const void *)) list_coll); + } else { + qsort(sl->leaves, sl->leaves_num, + sizeof(struct sort_list_item *), + (int(*)(const void *, const void *)) list_coll); + } + } else if (!sort_opts_vals.sflag && sort_opts_vals.complex_sort) { + qsort(sl->leaves, sl->leaves_num, + sizeof(struct sort_list_item *), + (int(*)(const void *, const void *)) list_coll_by_str_only); + } + } + + sl->leaves_sz = sl->leaves_num; + sl->leaves = sort_realloc(sl->leaves, (sizeof(struct sort_list_item *) * + (sl->leaves_sz))); + + if (!reverse_sort) { + memcpy(sl->sorted + sl->start_position, sl->leaves, + sl->leaves_num * sizeof(struct sort_list_item*)); + sl->start_position += sl->leaves_num; + sort_left_dec(sl->leaves_num); + + sort_free(sl->leaves); + sl->leaves = NULL; + sl->leaves_num = 0; + sl->leaves_sz = 0; + + sln = sl->sln; + + for (i = 0; i < sln; ++i) { + slc = sl->sublevels[i]; + + if (slc) { + slc->sorted = sl->sorted; + slc->start_position = sl->start_position; + sl->start_position += slc->tosort_num; + if (SMALL_NODE(slc)) + run_sort_level_next(slc); + else + push_ls(slc); + sl->sublevels[i] = NULL; + } + } + + } else { + size_t n; + + sln = sl->sln; + + for (i = 0; i < sln; ++i) { + n = sln - i - 1; + slc = sl->sublevels[n]; + + if (slc) { + slc->sorted = sl->sorted; + slc->start_position = sl->start_position; + sl->start_position += slc->tosort_num; + if (SMALL_NODE(slc)) + run_sort_level_next(slc); + else + push_ls(slc); + sl->sublevels[n] = NULL; + } + } + + memcpy(sl->sorted + sl->start_position, sl->leaves, + sl->leaves_num * sizeof(struct sort_list_item*)); + sort_left_dec(sl->leaves_num); + } + +end: + free_sort_level(sl); +} + +/* + * Single-threaded sort cycle + */ +static void +run_sort_cycle_st(void) +{ + struct sort_level *slc; + + for (;;) { + slc = pop_ls_st(); + if (slc == NULL) { + break; + } + run_sort_level_next(slc); + } +} + +#if defined(SORT_THREADS) + +/* + * Multi-threaded sort cycle + */ +static void +run_sort_cycle_mt(void) +{ + struct sort_level *slc; + + for (;;) { + slc = pop_ls_mt(); + if (slc == NULL) { + if (have_sort_left()) { + pthread_yield(); + continue; + } + break; + } + run_sort_level_next(slc); + } +} + +/* + * Sort cycle thread (in multi-threaded mode) + */ +static void* +sort_thread(void* arg) +{ + + run_sort_cycle_mt(); + + sem_post(&mtsem); + + return (arg); +} + +#endif /* defined(SORT_THREADS) */ + +static void +run_top_sort_level(struct sort_level *sl) +{ + struct sort_level *slc; + + reverse_sort = sort_opts_vals.kflag ? keys[0].sm.rflag : + default_sort_mods->rflag; + + sl->start_position = 0; + sl->sln = 256; + sl->sublevels = sort_malloc(slsz); + memset(sl->sublevels, 0, slsz); + + for (size_t i = 0; i < sl->tosort_num; ++i) + place_item(sl, i); + + if (sl->leaves_num > 1) { + if (keys_num > 1) { + if (sort_opts_vals.sflag) { + mergesort(sl->leaves, sl->leaves_num, + sizeof(struct sort_list_item *), + (int(*)(const void *, const void *)) list_coll); + } else { + qsort(sl->leaves, sl->leaves_num, + sizeof(struct sort_list_item *), + (int(*)(const void *, const void *)) list_coll); + } + } else if (!sort_opts_vals.sflag && sort_opts_vals.complex_sort) { + qsort(sl->leaves, sl->leaves_num, + sizeof(struct sort_list_item *), + (int(*)(const void *, const void *)) list_coll_by_str_only); + } + } + + if (!reverse_sort) { + memcpy(sl->tosort + sl->start_position, sl->leaves, + sl->leaves_num * sizeof(struct sort_list_item*)); + sl->start_position += sl->leaves_num; + sort_left_dec(sl->leaves_num); + + for (size_t i = 0; i < sl->sln; ++i) { + slc = sl->sublevels[i]; + + if (slc) { + slc->sorted = sl->tosort; + slc->start_position = sl->start_position; + sl->start_position += slc->tosort_num; + push_ls(slc); + sl->sublevels[i] = NULL; + } + } + + } else { + size_t n; + + for (size_t i = 0; i < sl->sln; ++i) { + + n = sl->sln - i - 1; + slc = sl->sublevels[n]; + + if (slc) { + slc->sorted = sl->tosort; + slc->start_position = sl->start_position; + sl->start_position += slc->tosort_num; + push_ls(slc); + sl->sublevels[n] = NULL; + } + } + + memcpy(sl->tosort + sl->start_position, sl->leaves, + sl->leaves_num * sizeof(struct sort_list_item*)); + + sort_left_dec(sl->leaves_num); + } + +#if defined(SORT_THREADS) + if (nthreads < 2) { +#endif + run_sort_cycle_st(); +#if defined(SORT_THREADS) + } else { + size_t i; + + for(i = 0; i < nthreads; ++i) { + pthread_attr_t attr; + pthread_t pth; + + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, + PTHREAD_DETACHED); + + pthread_create(&pth, &attr, sort_thread, NULL); + + pthread_attr_destroy(&attr); + } + + for(i = 0; i < nthreads; ++i) + sem_wait(&mtsem); + } +#endif /* defined(SORT_THREADS) */ +} + +static void +run_sort(struct sort_list_item **base, size_t nmemb) +{ + struct sort_level *sl; + +#if defined(SORT_THREADS) + if (nthreads > 1) { + pthread_mutexattr_t mattr; + + pthread_mutexattr_init(&mattr); + pthread_mutexattr_settype(&mattr, PTHREAD_MUTEX_ADAPTIVE_NP); + + pthread_mutex_init(&g_ls_mutex, &mattr); + pthread_mutex_init(&sort_left_mutex, &mattr); + + pthread_mutexattr_destroy(&mattr); + + sem_init(&mtsem, 0, 0); + + } +#endif + + sl = sort_malloc(sizeof(struct sort_level)); + memset(sl, 0, sizeof(struct sort_level)); + + sl->tosort = base; + sl->tosort_num = nmemb; + sl->tosort_sz = nmemb; + +#if defined(SORT_THREADS) + sort_left = nmemb; +#endif + + run_top_sort_level(sl); + + free_sort_level(sl); + +#if defined(SORT_THREADS) + if (nthreads > 1) { + sem_destroy(&mtsem); + pthread_mutex_destroy(&g_ls_mutex); + pthread_mutex_destroy(&sort_left_mutex); + } +#endif +} + +void +rxsort(struct sort_list_item **base, size_t nmemb) +{ + + run_sort(base, nmemb); +} diff --git a/usr.bin/sort/radixsort.h b/usr.bin/sort/radixsort.h new file mode 100644 index 000000000000..5276d7162c59 --- /dev/null +++ b/usr.bin/sort/radixsort.h @@ -0,0 +1,38 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (C) 2012 Oleg Moskalenko + * Copyright (C) 2012 Gabor Kovesdan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if !defined(__SORT_RADIX_H__) +#define __SORT_RADIX_H__ + +#include "coll.h" +#include "sort.h" + +void rxsort(struct sort_list_item **base, size_t nmemb); + +#endif /* __SORT_RADIX_H__ */ diff --git a/usr.bin/sort/sort.1.in b/usr.bin/sort/sort.1.in new file mode 100644 index 000000000000..e24f3538d4de --- /dev/null +++ b/usr.bin/sort/sort.1.in @@ -0,0 +1,635 @@ +.\" $OpenBSD: sort.1,v 1.31 2007/08/21 21:22:37 millert Exp $ +.\" $FreeBSD$ +.\" +.\" Copyright (c) 1991, 1993 +.\" The Regents of the University of California. All rights reserved. +.\" +.\" This code is derived from software contributed to Berkeley by +.\" the Institute of Electrical and Electronics Engineers, Inc. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" 3. Neither the name of the University nor the names of its contributors +.\" may be used to endorse or promote products derived from this software +.\" without specific prior written permission. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" @(#)sort.1 8.1 (Berkeley) 6/6/93 +.\" +.Dd May 6, 2012 +.Dt SORT 1 +.Os +.Sh NAME +.Nm sort +.Nd sort or merge records (lines) of text and binary files +.Sh SYNOPSIS +.Nm sort +.Bk -words +.Op Fl bcCdfghiRMmnrsuVz +.Sm off +.Op Fl k\ \& Ar field1 Op , Ar field2 +.Sm on +.Op Fl S Ar memsize +.Ek +.Op Fl T Ar dir +.Op Fl t Ar char +.Op Fl o Ar output +.Op Ar file ... +.Nm sort +.Fl Fl help +.Nm sort +.Fl Fl version +.Sh DESCRIPTION +The +.Nm +utility sorts text and binary files by lines. +A line is a record separated from the subsequent record by a +newline (default) or NUL \'\\0\' character (-z option). +A record can contain any printable or unprintable characters. +Comparisons are based on one or more sort keys extracted from +each line of input, and are performed lexicographically, +according to the current locale's collating rules and the +specified command-line options that can tune the actual +sorting behavior. +By default, if keys are not given, +.Nm +uses entire lines for comparison. +.Pp +The command line options are as follows: +.Bl -tag -width Ds +.It Fl c, Fl Fl check, Fl C, Fl Fl check=silent|quiet +Check that the single input file is sorted. +If the file is not sorted, +.Nm +produces the appropriate error messages and exits with code 1, +otherwise returns 0. +If +.Fl C +or +.Fl Fl check=silent +is specified, +.Nm +produces no output. +This is a "silent" version of +.Fl c. +.It Fl m , Fl Fl merge +Merge only. +The input files are assumed to be pre-sorted. +If they are not sorted the output order is undefined. +.It Fl o Ar output , Fl Fl output Ns = Ns Ar output +Print the output to the +.Ar output +file instead of the standard output. +.It Fl S Ar size, Fl Fl buffer-size Ns = Ns Ar size +Use +.Ar size +for the maximum size of the memory buffer. +Size modifiers %,b,K,M,G,T,P,E,Z,Y can be used. +If a memory limit is not explicitly specified, +.Nm +takes up to about 90% of available memory. +If the file size is too big to fit into the memory buffer, +the temporary disk files are used to perform the sorting. +.It Fl T Ar dir , Fl Fl temporary-directory Ns = Ns Ar dir +Store temporary files in the directory +.Ar dir . +The default path is the value of the environment variable +.Ev TMPDIR +or +.Pa /var/tmp +if +.Ev TMPDIR +is not defined. +.It Fl u , Fl Fl unique +Unique keys. +Suppress all lines that have a key that is equal to an already +processed one. +This option, similarly to +.Fl s , +implies a stable sort. +If used with +.Fl c +or +.Fl C , +.Nm +also checks that there are no lines with duplicate keys. +.It Fl s +Stable sort. +This option maintains the original record order of records that have +and equal key. +This is a non-standard feature, but it is widely accepted and used. +.It Fl Fl version +Print the version and silently exits. +.It Fl Fl help +Print the help text and silently exits. +.El +.Pp +The following options override the default ordering rules. +When ordering options appear independently of key field +specifications, they apply globally to all sort keys. +When attached to a specific key (see +.Fl k ) , +the ordering options override all global ordering options for +the key they are attahced to. +.Bl -tag -width indent +.It Fl b, Fl Fl ignore-leading-blanks +Ignore leading blank characters when comparing lines. +.It Fl d , Fl Fl dictionary-order +Consider only blank spaces and alphanumeric characters in comparisons. +.It Fl f , Fl Fl ignore-case +Convert all lowercase characters to their uppercase equivalent +before comparison, that is, perform case-independent sorting. +.It Fl g, Fl Fl general-numeric-sort, Fl Fl sort=general-numeric +Sort by general numerical value. +As opposed to +.Fl n , +this option handles general floating points, which have a much +permissive format than those allowed by +. Fl n , +but it has a significant performance drawback. +.It Fl h, Fl Fl human-numeric-sort, Fl Fl sort=human-numeric +Sort by numerical value, but take into account the SI suffix, +if present. +Sort first by numeric sign (negative, zero, or +positive); then by SI suffix (either empty, or `k' or `K', or one +of `MGTPEZY', in that order); and finally by numeric value. +The SI suffix must immediately follow the number. +For example, '12345K' sorts before '1M', because M is "larger" than K. +This sort option is useful for sorting the output of a single invocation +of 'df' command with +.Fl h +or +.Fl H +options (human-readable). +.It Fl i , Fl Fl ignore-nonprinting +Ignore all non-printable characters. +.It Fl M, Fl Fl month-sort, Fl Fl sort=month +Sort by month abbreviations. +Unknown strings are considered smaller than the month names. +.It Fl n , Fl Fl numeric-sort, Fl Fl sort=numeric +Sort fields numerically by arithmetic value. +Fields are supposed to have optional blanks in the beginning, an +optional minus sign, zero or more digits (including decimal point and +possible thousand separators). +.It Fl R, Fl Fl random-sort, Fl Fl sort=random +Sort by a random order. +This is a random permutation of the inputs except that +the equal keys sort together. +It is implemented by hashing the input keys and sorting +the hash values. +The hash function is choosen randomly. +The hash function is randomized by +.Cm /dev/random +content, or by file content if it is specified by +.Fl Fl random-source . +Even if multiple sort fields are specified, +the same random hash function is used for all of them. +.It Fl r , Fl Fl reverse +Sort in reverse order. +.It Fl V, Fl Fl version-sort +Sort version numbers. +The input lines are treated as file names in form +PREFIX VERSION SUFFIX, where SUFFIX matches the regular expression +"(\.([A-Za-z~][A-Za-z0-9~]*)?)*". +The files are compared by their prefixes and versions (leading +zeros are ignored in version numbers, see example below). +If an input string does not match the pattern, then it is compared +using the byte compare function. +All string comparisions are performed in C locale, the locale +environment setting is ignored. +.Bl -tag -width indent +.It Example: +.It $ ls sort* | sort -V +.It sort-1.022.tgz +.It sort-1.23.tgz +.It sort-1.23.1.tgz +.It sort-1.024.tgz +.It sort-1.024.003. +.It sort-1.024.003.tgz +.It sort-1.024.07.tgz +.It sort-1.024.009.tgz +.El +.El +.Pp +The treatment of field separators can be altered using these options: +.Bl -tag -width indent +.It Fl b , Fl Fl ignore-leading-blanks +Ignore leading blank space when determining the start +and end of a restricted sort key (see +.Fl k +). +If +.Fl b +is specified before the first +.Fl k +option, it applies globally to all key specifications. +Otherwise, +.Fl b +can be attached independently to each +.Ar field +argument of the key specifications. +.Fl b . +.It Xo +.Sm off +.Fl k\ \& Ar field1 Op , Ar field2 , Fl Fl key Ns = Ns Ar field1 Op , Ar field2 +.Sm on +.Xc +Define a restricted sort key that has the starting position +.Ar field1 , +and optional ending position +.Ar field2 +of a key field. +The +.Fl k +option may be specified multiple times, +in which case subsequent keys are compared when earlier keys compare equal. +The +.Fl k +option replaces the obsolete options +.Cm \(pl Ns Ar pos1 +and +.Fl Ns Ar pos2 , +but the old notation is also supported. +.It Fl t Ar char , Fl Fl field-separator Ns = Ns Ar char +Use +.Ar char +as a field separator character. +The initial +.Ar char +is not considered to be part of a field when determining key offsets. +Each occurrence of +.Ar char +is significant (for example, +.Dq Ar charchar +delimits an empty field). +If +.Fl t +is not specified, the default field separator is a sequence of +blank space characters, and consecutive blank spaces do +.Em not +delimit an empty field, however, the initial blank space +.Em is +considered part of a field when determining key offsets. +To use NUL as field separator, use +.Fl t +\'\\0\'. +.It Fl z , Fl Fl zero-terminated +Use NUL as record separator. +By default, records in the files are supposed to be separated by +the newline characters. +With this option, NUL (\'\\0\') is used as a record separator character. +.El +.Pp +Other options: +.Bl -tag -width indent +.It Fl Fl batch-size Ns = Ns Ar num +Specify maximum number of files that can be opened by +.Nm +at once. +This option affects behavior when having many input files or using +temporary files. +The default value is 16. +.It Fl Fl compress-program Ns = Ns Ar PROGRAM +Use PROGRAM to compress temporary files. +PROGRAM must compress standard input to standard output, when called +without arguments. +When called with argument +.Fl d +it must decompress standard input to standard output. +If PROGRAM fails, +.Nm +must exit with error. +An example of PROGRAM that can be used here is bzip2. +.It Fl Fl random-source Ns = Ns Ar filename +In random sort, the file content is used as the source of the 'seed' data +for the hash function choice. +Two invocations of random sort with the same seed data will use +the same hash function and will produce the same result if the input is +also identical. +By default, file +.Cm /dev/random +is used. +.It Fl Fl debug +Print some extra information about the sorting process to the +standard output. +%%THREADS%%.It Fl Fl nthreads +%%THREADS%%Set the maximum number of execution threads. +%%THREADS%%Default number equals to the number of CPUs. +.It Fl Fl files0-from Ns = Ns Ar filename +Take the input file list from the file +.Ar filename. +The file names must be separated by NUL +(like the output produced by the command "find ... -print0"). +.It Fl Fl radixsort +Try to use radix sort, if the sort specifications allow. +The radix sort can only be used for trivial locales (C and POSIX), +and it cannot be used for numeric or month sort. +Radix sort is very fast and stable. +.It Fl Fl mergesort +Use mergesort. +This is a universal algorithm that can always be used, +but it is not always the fastest. +.It Fl Fl qsort +Try to use quick sort, if the sort specifications allow. +This sort algorithm cannot be used with +.Fl u +and +.Fl s . +.It Fl Fl heapsort +Try to use heap sort, if the sort specifications allow. +This sort algorithm cannot be used with +.Fl u +and +.Fl s . +.El +.Pp +The following operands are available: +.Bl -tag -width indent +.It Ar file +The pathname of a file to be sorted, merged, or checked. +If no +.Ar file +operands are specified, or if a +.Ar file +operand is +.Fl , +the standard input is used. +.El +.Pp +A field is defined as a maximal sequence of characters other than the +field separator and record separator (newline by default). +Initial blank spaces are included in the field unless +.Fl b +has been specified; +the first blank space of a sequence of blank spaces acts as the field +separator and is included in the field (unless +.Fl t +is specified). +For example, all blank spaces at the beginning of a line are +considered to be part of the first field. +.Pp +Fields are specified by the +.Sm off +.Fl k\ \& Ar field1 Op , Ar field2 +.Sm on +command-line option. +If +.Ar field2 +is missing, the end of the key defaults to the end of the line. +.Pp +The arguments +.Ar field1 +and +.Ar field2 +have the form +.Em m.n +.Em (m,n > 0) +and can be followed by one or more of the modifiers +.Cm b , d , f , i , +.Cm n , g , M +and +.Cm r , +which correspond to the options discussed above. +When +.Cm b +is specified it applies only to +.Ar field1 +or +.Ar field2 +where it is specified while the rest of the modifiers +apply to the whole key field regardless if they are +specified only with +.Ar field1 +or +.Ar field2 +or both. +A +.Ar field1 +position specified by +.Em m.n +is interpreted as the +.Em n Ns th +character from the beginning of the +.Em m Ns th +field. +A missing +.Em \&.n +in +.Ar field1 +means +.Ql \&.1 , +indicating the first character of the +.Em m Ns th +field; if the +.Fl b +option is in effect, +.Em n +is counted from the first non-blank character in the +.Em m Ns th +field; +.Em m Ns \&.1b +refers to the first non-blank character in the +.Em m Ns th +field. +.No 1\&. Ns Em n +refers to the +.Em n Ns th +character from the beginning of the line; +if +.Em n +is greater than the length of the line, the field is taken to be empty. +.Pp +.Em n Ns th +positions are always counted from the field beginning, even if the field +is shorter than the number of specified positions. +Thus, the key can really start from a position in a subsequent field. +.Pp +A +.Ar field2 +position specified by +.Em m.n +is interpreted as the +.Em n Ns th +character (including separators) from the beginning of the +.Em m Ns th +field. +A missing +.Em \&.n +indicates the last character of the +.Em m Ns th +field; +.Em m += \&0 +designates the end of a line. +Thus the option +.Fl k Ar v.x,w.y +is synonymous with the obsolete option +.Cm \(pl Ns Ar v-\&1.x-\&1 +.Fl Ns Ar w-\&1.y ; +when +.Em y +is omitted, +.Fl k Ar v.x,w +is synonymous with +.Cm \(pl Ns Ar v-\&1.x-\&1 +.Fl Ns Ar w\&.0 . +The obsolete +.Cm \(pl Ns Ar pos1 +.Fl Ns Ar pos2 +option is still supported, except for +.Fl Ns Ar w\&.0b , +which has no +.Fl k +equivalent. +.Sh EXIT STATUS +The +.Nm +utility shall exit with one of the following values: +.Pp +.Bl -tag -width flag -compact +.It 0 +Successfully sorted the input files or if used with +.Fl c +or +.Fl C , +the input file already met the sorting criteria. +.It 1 +On disorder (or non-uniqueness) with the +.Fl c +or +.Fl C +options. +.It 2 +An error occurred. +.El +.Sh ENVIRONMENT +.Bl -tag -width Fl +.It Ev LC_COLLATE +Locale settings to be used to determine the collation for +sorting records. +.It Ev LC_CTYPE +Locale settings to be used to case conversion and classification +of characters, that is, which characters are considered +whitespaces, etc. +.It Ev LC_MESSAGES +Locale settings that determine the language of output messages +that +.Nm +prints out. +.It Ev LC_NUMERIC +Locale settings that determine the number format used in numeric sort. +.It Ev LC_TIME +Locale settings that determine the month format used in month sort. +.It Ev LC_ALL +Locale settings that override all of the above locale settings. +This environment variable can be used to set all these settings +to the same value at once. +.It Ev LANG +Used as a last resort to determine different kinds of locale-specific +behavior if neither the respective environment variable, nor +.Ev LC_ALL +are set. +%%NLS%%.It Ev NLSPATH +%%NLS%%Path to NLS catalogs. +.It Ev TMPDIR +Path to the directory in which temporary files will be stored. +Note that +.Ev TMPDIR +may be overridden by the +.Fl T +option. +.It Ev GNUSORT_NUMERIC_COMPATIBILITY +If defined +.Fl t +will not override the locale numeric symbols, that is, thousand +separators and decimal separators. +By default, if we specify +.Fl t +with the same symbol as the thousand separator or decimal point, +the symbol will be treated as the field separator. +Older behavior was less definite; the symbol was treated as both field +separator and numeric separator, simultaneously. +This environment variable enables the old behavior. +.El +.Sh FILES +.Bl -tag -width Pa -compact +.It Pa /var/tmp/.bsdsort.PID.* +Temporary files. +.It Pa /dev/random +Default seed file for the random sort. +.El +.Sh SEE ALSO +.Xr comm 1 , +.Xr join 1 , +.Xr uniq 1 . +.Sh STANDARDS +The +.Nm +utility is compliant with the +.St -p1003.1-2008 +specification. +.Pp +The flags +.Op Fl ghRMSsTVz +are extensions to the POSIX specification. +.Pp +All long options are extensions to the specification, some of them are +provided for compatibility with GNU versions and some of them are +own extensions. +.Pp +The old key notations +.Cm \(pl Ns Ar pos1 +and +.Fl Ns Ar pos2 +come from older versions of +.Nm +and are still supported but their use is highly discouraged. +.Sh HISTORY +A +.Nm +command first appeared in +.At v3 . +.Sh AUTHORS +Gabor Kovesdan , +.Pp +Oleg Moskalenko +.Sh NOTES +This implementation of +.Nm +has no limits on input line length (other than imposed by available +memory) or any restrictions on bytes allowed within lines. +.Pp +The performance depends highly on locale settings, +efficient choice of sort keys and key complexity. +The fastest sort is with locale C, on whole lines, +with option +.Fl s. +In general, locale C is the fastest, then single-byte +locales follow and multi-byte locales as the slowest but +the correct collation order is always respected. +As for the key specification, the simpler to process the +lines the faster the search will be. +.Pp +When sorting by arithmetic value, using +.Fl n +results in much better performance than +.Fl g +so its use is encouraged +whenever possible. diff --git a/usr.bin/sort/sort.c b/usr.bin/sort/sort.c new file mode 100644 index 000000000000..ee26553531ee --- /dev/null +++ b/usr.bin/sort/sort.c @@ -0,0 +1,1324 @@ +/*- + * Copyright (C) 2009 Gabor Kovesdan + * Copyright (C) 2012 Oleg Moskalenko + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "coll.h" +#include "file.h" +#include "sort.h" + +#ifndef WITHOUT_NLS +#include +nl_catd catalog; +#endif + +#define OPTIONS "bcCdfghik:Mmno:RrsS:t:T:uVz" + +#define DEFAULT_RANDOM_SORT_SEED_FILE ("/dev/random") +#define MAX_DEFAULT_RANDOM_SEED_DATA_SIZE (1024) + +static bool need_random = false; +static const char* random_source = DEFAULT_RANDOM_SORT_SEED_FILE; +static const void* random_seed = NULL; +static size_t random_seed_size = 0; + +MD5_CTX md5_ctx; + +/* + * Default messages to use when NLS is disabled or no catalogue + * is found. + */ +const char *nlsstr[] = { "", +/* 1*/"you cannot use -%c and -%c together", +/* 2*/"extra argument not allowed with -c", +/* 3*/"Unknown feature: %s", +/* 4*/"Wrong memory buffer specification", +/* 5*/"0 field in key specs", +/* 6*/"0 column in key specs", +/* 7*/"Wrong file mode", +/* 8*/"Cannot open file for reading", +/* 9*/"Radix sort cannot be used with these sort options", +/*10*/"The chosen sort method cannot be used with stable and/or unique sort", +/*11*/"Invalid key position", +/*12*/"Usage: %s [-bcCdfigMmnrsuz] [-kPOS1[,POS2] ... ] " + "[+POS1 [-POS2]] [-S memsize] [-T tmpdir] [-t separator] " + "[-o outfile] [--batch-size size] [--files0-from file] " + "[--heapsort] [--mergesort] [--radixsort] [--qsort] " +#if defined(SORT_THREADS) + "[--nthreads thread_no] " +#endif + "[--human-numeric-sort] " + "[--version-sort] [--random-sort [--random-source file]] " + "[--compress-program program] [file ...]\n" }; + +struct sort_opts sort_opts_vals; + +bool debug_sort = false; +bool need_hint = false; + +#if defined(SORT_THREADS) +size_t ncpu = 1; +size_t nthreads = 1; +#endif + +static bool gnusort_numeric_compatibility = false; + +static struct sort_mods default_sort_mods_object; +struct sort_mods * const default_sort_mods = &default_sort_mods_object; + +static bool print_symbols_on_debug = false; + +/* + * Arguments from file (when file0-from option is used: + */ +static int argc_from_file0 = -1; +static char **argv_from_file0 = NULL; + +/* + * Placeholder symbols for options which have no single-character equivalent + */ +enum +{ + SORT_OPT = CHAR_MAX + 1, + HELP_OPT, + FF_OPT, + BS_OPT, + VERSION_OPT, + DEBUG_OPT, +#if defined(SORT_THREADS) + NTHREADS_OPT, +#endif + RANDOMSOURCE_OPT, + COMPRESSPROGRAM_OPT, + QSORT_OPT, + MERGESORT_OPT, + HEAPSORT_OPT, + RADIXSORT_OPT +}; + +#define NUMBER_OF_MUTUALLY_EXCLUSIVE_FLAGS 6 +static const char mutually_exclusive_flags[NUMBER_OF_MUTUALLY_EXCLUSIVE_FLAGS] = { 'M', 'n', 'g', 'R', 'h', 'V' }; + +struct option long_options[] = { + { "batch-size", required_argument, NULL, BS_OPT }, + { "buffer-size", required_argument, NULL, 'S' }, + { "check", optional_argument, NULL, 'c' }, + { "check=silent|quiet", optional_argument, NULL, 'C' }, + { "compress-program", required_argument, NULL, COMPRESSPROGRAM_OPT }, + { "debug", no_argument, NULL, DEBUG_OPT }, + { "dictionary-order", no_argument, NULL, 'd' }, + { "field-separator", required_argument, NULL, 't' }, + { "files0-from", required_argument, NULL, FF_OPT }, + { "general-numeric-sort", no_argument, NULL, 'g' }, + { "heapsort", no_argument, NULL, HEAPSORT_OPT }, + { "help",no_argument, NULL, HELP_OPT }, + { "human-numeric-sort", no_argument, NULL, 'h' }, + { "ignore-leading-blanks", no_argument, NULL, 'b' }, + { "ignore-case", no_argument, NULL, 'f' }, + { "ignore-nonprinting", no_argument, NULL, 'i' }, + { "key", required_argument, NULL, 'k' }, + { "merge", no_argument, NULL, 'm' }, + { "mergesort", no_argument, NULL, MERGESORT_OPT }, + { "month-sort", no_argument, NULL, 'M' }, + { "numeric-sort", no_argument, NULL, 'n' }, + { "output", required_argument, NULL, 'o' }, +#if defined(SORT_THREADS) + { "nthreads", required_argument, NULL, NTHREADS_OPT }, +#endif + { "qsort", no_argument, NULL, QSORT_OPT }, + { "radixsort", no_argument, NULL, RADIXSORT_OPT }, + { "random-sort", no_argument, NULL, 'R' }, + { "random-source", required_argument, NULL, RANDOMSOURCE_OPT }, + { "reverse", no_argument, NULL, 'r' }, + { "sort", required_argument, NULL, SORT_OPT }, + { "stable", no_argument, NULL, 's' }, + { "temporary-directory",required_argument, NULL, 'T' }, + { "unique", no_argument, NULL, 'u' }, + { "version", no_argument, NULL, VERSION_OPT }, + { "version-sort",no_argument, NULL, 'V' }, + { "zero-terminated", no_argument, NULL, 'z' }, + { NULL, no_argument, NULL, 0 } +}; + +void fix_obsolete_keys(int *argc, char **argv); + +/* + * Check where sort modifier is present + */ +static bool +sort_modifier_empty(struct sort_mods *sm) +{ + + if (sm == NULL) + return (true); + return (!(sm->Mflag || sm->Vflag || sm->nflag || sm->gflag || + sm->rflag || sm->Rflag || sm->hflag || sm->dflag || sm->fflag)); +} + +/* + * Print out usage text. + */ +static void +usage(bool opt_err) +{ + struct option *o; + FILE *out; + + out = stdout; + o = &(long_options[0]); + + if (opt_err) + out = stderr; + fprintf(out, getstr(12), getprogname()); + if (opt_err) + exit(2); + exit(0); +} + +/* + * Read input file names from a file (file0-from option). + */ +static void +read_fns_from_file0(const char *fn) +{ + if (fn) { + struct file0_reader f0r; + FILE *f; + + f = fopen(fn, "r"); + if (f == NULL) + err(2, NULL); + + memset(&f0r, 0, sizeof(f0r)); + f0r.f = f; + + while (!feof(f)) { + char *line = read_file0_line(&f0r); + + if (line && *line) { + ++argc_from_file0; + if (argc_from_file0 < 1) + argc_from_file0 = 1; + argv_from_file0 = sort_realloc(argv_from_file0, + argc_from_file0 * sizeof(char *)); + if (argv_from_file0 == NULL) + err(2, NULL); + argv_from_file0[argc_from_file0 - 1] = + sort_strdup(line); + } + } + closefile(f, fn); + } +} + +/* + * Check how much RAM is available for the sort. + */ +static void +set_hw_params(void) +{ +#if defined(SORT_THREADS) + size_t ncpusz; +#endif + size_t pages, psize, psz, pszsz; + + pages = psize = 0; +#if defined(SORT_THREADS) + ncpu = 1; + ncpusz = sizeof(size_t); +#endif + psz = pszsz = sizeof(size_t); + + if (sysctlbyname("vm.stats.vm.v_free_count", &pages, &psz, + NULL, 0) < 0) { + perror("vm.stats.vm.v_free_count"); + return; + } + if (sysctlbyname("vm.stats.vm.v_page_size", &psize, &pszsz, + NULL, 0) < 0) { + perror("vm.stats.vm.v_page_size"); + return; + } +#if defined(SORT_THREADS) + if (sysctlbyname("hw.ncpu", &ncpu, &ncpusz, + NULL, 0) < 0) + ncpu = 1; + else if(ncpu > 32) + ncpu = 32; + + nthreads = ncpu; +#endif + + free_memory = (unsigned long long) pages * (unsigned long long) psize; + available_free_memory = (free_memory * 9) / 10; +} + +/* + * Convert "plain" symbol to wide symbol, with default value. + */ +static void +conv_mbtowc(wchar_t *wc, const char *c, const wchar_t def) +{ + + if (wc && c) { + int res; + + res = mbtowc(wc, c, MB_CUR_MAX); + if (res < 1) + *wc = def; + } +} + +/* + * Set current locale symbols. + */ +static void +set_locale(void) +{ + struct lconv *lc; + const char *locale; + + setlocale(LC_ALL, ""); + + lc = localeconv(); + + if (lc) { + /* obtain LC_NUMERIC info */ + /* Convert to wide char form */ + conv_mbtowc(&symbol_decimal_point, lc->decimal_point, + symbol_decimal_point); + conv_mbtowc(&symbol_thousands_sep, lc->thousands_sep, + symbol_thousands_sep); + conv_mbtowc(&symbol_positive_sign, lc->positive_sign, + symbol_positive_sign); + conv_mbtowc(&symbol_negative_sign, lc->negative_sign, + symbol_negative_sign); + } + + if (getenv("GNUSORT_NUMERIC_COMPATIBILITY")) + gnusort_numeric_compatibility = true; + + locale = setlocale(LC_COLLATE, NULL); + + if (locale) { + char *tmpl; + const char *cclocale; + + tmpl = sort_strdup(locale); + cclocale = setlocale(LC_COLLATE, "C"); + if (cclocale && !strcmp(cclocale, tmpl)) + byte_sort = true; + else { + const char *pclocale; + + pclocale = setlocale(LC_COLLATE, "POSIX"); + if (pclocale && !strcmp(pclocale, tmpl)) + byte_sort = true; + } + setlocale(LC_COLLATE, tmpl); + sort_free(tmpl); + } +} + +/* + * Set directory temporary files. + */ +static void +set_tmpdir(void) +{ + char *td; + + td = getenv("TMPDIR"); + if (td != NULL) + tmpdir = sort_strdup(td); +} + +/* + * Parse -S option. + */ +static unsigned long long +parse_memory_buffer_value(const char *value) +{ + + if (value == NULL) + return (available_free_memory); + else { + char *endptr; + unsigned long long membuf; + + endptr = NULL; + errno = 0; + membuf = strtoll(value, &endptr, 10); + + if (errno != 0) { + warn(getstr(4)); + membuf = available_free_memory; + } else { + switch (*endptr){ + case 'Y': + membuf *= 1024; + /* FALLTHROUGH */ + case 'Z': + membuf *= 1024; + /* FALLTHROUGH */ + case 'E': + membuf *= 1024; + /* FALLTHROUGH */ + case 'P': + membuf *= 1024; + /* FALLTHROUGH */ + case 'T': + membuf *= 1024; + /* FALLTHROUGH */ + case 'G': + membuf *= 1024; + /* FALLTHROUGH */ + case 'M': + membuf *= 1024; + /* FALLTHROUGH */ + case '\0': + case 'K': + membuf *= 1024; + /* FALLTHROUGH */ + case 'b': + break; + case '%': + membuf = (available_free_memory * membuf) / + 100; + break; + default: + fprintf(stderr, "%s: %s\n", strerror(EINVAL), + optarg); + membuf = available_free_memory; + } + } + return (membuf); + } +} + +/* + * Signal handler that clears the temporary files. + */ +static void +sig_handler(int sig, siginfo_t *siginfo, void *context) +{ + + sig = sig; + siginfo = siginfo; + context = context; + clear_tmp_files(); + exit(-1); +} + +/* + * Set signal handler on panic signals. + */ +static void +set_signal_handler(void) +{ + struct sigaction sa; + + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = &sig_handler; + sa.sa_flags = SA_SIGINFO; + + if (sigaction(SIGTERM, &sa, NULL) < 0) { + perror("sigaction"); + return; + } + if (sigaction(SIGHUP, &sa, NULL) < 0) { + perror("sigaction"); + return; + } + if (sigaction(SIGINT, &sa, NULL) < 0) { + perror("sigaction"); + return; + } + if (sigaction(SIGQUIT, &sa, NULL) < 0) { + perror("sigaction"); + return; + } + if (sigaction(SIGABRT, &sa, NULL) < 0) { + perror("sigaction"); + return; + } + if (sigaction(SIGBUS, &sa, NULL) < 0) { + perror("sigaction"); + return; + } + if (sigaction(SIGSEGV, &sa, NULL) < 0) { + perror("sigaction"); + return; + } + if (sigaction(SIGUSR1, &sa, NULL) < 0) { + perror("sigaction"); + return; + } + if (sigaction(SIGUSR2, &sa, NULL) < 0) { + perror("sigaction"); + return; + } +} + +/* + * Print "unknown" message and exit with status 2. + */ +static void +unknown(const char *what) +{ + + errx(2, getstr(3), what); +} + +/* + * Check whether contradictory input options are used. + */ +static void +check_mutually_exclusive_flags(char c, bool *mef_flags) +{ + int fo_index, mec; + bool found_others, found_this; + + found_others = found_this =false; + fo_index = 0; + + for (int i = 0; i < NUMBER_OF_MUTUALLY_EXCLUSIVE_FLAGS; i++) { + mec = mutually_exclusive_flags[i]; + + if (mec != c) { + if (mef_flags[i]) { + if (found_this) + errx(1, getstr(1), c, mec); + found_others = true; + fo_index = i; + } + } else { + if (found_others) + errx(1, getstr(1), c, + mutually_exclusive_flags[fo_index]); + mef_flags[i] = true; + found_this = true; + } + } +} + +/* + * Initialise sort opts data. + */ +static void +set_sort_opts(void) +{ + + memset(&default_sort_mods_object, 0, + sizeof(default_sort_mods_object)); + memset(&sort_opts_vals, 0, sizeof(sort_opts_vals)); + default_sort_mods_object.func = + get_sort_func(&default_sort_mods_object); +} + +/* + * Set a sort modifier on a sort modifiers object. + */ +static bool +set_sort_modifier(struct sort_mods *sm, int c) +{ + + if (sm) { + switch (c){ + case 'b': + sm->bflag = true; + break; + case 'd': + sm->dflag = true; + break; + case 'f': + sm->fflag = true; + break; + case 'g': + sm->gflag = true; + need_hint = true; + break; + case 'i': + sm->iflag = true; + break; + case 'R': + sm->Rflag = true; + need_random = true; + break; + case 'M': + initialise_months(); + sm->Mflag = true; + need_hint = true; + break; + case 'n': + sm->nflag = true; + need_hint = true; + print_symbols_on_debug = true; + break; + case 'r': + sm->rflag = true; + break; + case 'V': + sm->Vflag = true; + break; + case 'h': + sm->hflag = true; + need_hint = true; + print_symbols_on_debug = true; + break; + default: + return false; + } + sort_opts_vals.complex_sort = true; + sm->func = get_sort_func(sm); + } + return (true); +} + +/* + * Parse POS in -k option. + */ +static int +parse_pos(const char *s, struct key_specs *ks, bool *mef_flags, bool second) +{ + regmatch_t pmatch[4]; + regex_t re; + char *c, *f; + const char *sregexp = "^([0-9]+)(\\.[0-9]+)?([bdfirMngRhV]+)?$"; + size_t len, nmatch; + int ret; + + ret = -1; + nmatch = 4; + c = f = NULL; + + if (regcomp(&re, sregexp, REG_EXTENDED) != 0) + return (-1); + + if (regexec(&re, s, nmatch, pmatch, 0) != 0) + goto end; + + if (pmatch[0].rm_eo <= pmatch[0].rm_so) + goto end; + + if (pmatch[1].rm_eo <= pmatch[1].rm_so) + goto end; + + len = pmatch[1].rm_eo - pmatch[1].rm_so; + f = sort_malloc((len + 1) * sizeof(char)); + + strncpy(f, s + pmatch[1].rm_so, len); + f[len] = '\0'; + + if (second) { + errno = 0; + ks->f2 = (size_t) strtoul(f, NULL, 10); + if (errno != 0) + errx(2, "%s: -k", strerror(errno)); + if (ks->f2 == 0) { + warn(getstr(5)); + goto end; + } + } else { + errno = 0; + ks->f1 = (size_t) strtoul(f, NULL, 10); + if (errno != 0) + errx(2, "%s: -k", strerror(errno)); + if (ks->f1 == 0) { + warn(getstr(5)); + goto end; + } + } + + if (pmatch[2].rm_eo > pmatch[2].rm_so) { + len = pmatch[2].rm_eo - pmatch[2].rm_so - 1; + c = sort_malloc((len + 1) * sizeof(char)); + + strncpy(c, s + pmatch[2].rm_so + 1, len); + c[len] = '\0'; + + if (second) { + errno = 0; + ks->c2 = (size_t) strtoul(c, NULL, 10); + if (errno != 0) + errx(2, "%s: -k", strerror(errno)); + } else { + errno = 0; + ks->c1 = (size_t) strtoul(c, NULL, 10); + if (errno != 0) + errx(2, "%s: -k", strerror(errno)); + if (ks->c1 == 0) { + warn(getstr(6)); + goto end; + } + } + } else { + if (second) + ks->c2 = 0; + else + ks->c1 = 1; + } + + if (pmatch[3].rm_eo > pmatch[3].rm_so) { + regoff_t i = 0; + + for (i = pmatch[3].rm_so; i < pmatch[3].rm_eo; i++) { + check_mutually_exclusive_flags(s[i], mef_flags); + if (s[i] == 'b') { + if (second) + ks->pos2b = true; + else + ks->pos1b = true; + } else if (!set_sort_modifier(&(ks->sm), s[i])) + goto end; + } + } + + ret = 0; + +end: + + if (c) + sort_free(c); + if (f) + sort_free(f); + regfree(&re); + + return (ret); +} + +/* + * Parse -k option value. + */ +static int +parse_k(const char *s, struct key_specs *ks) +{ + int ret = -1; + bool mef_flags[NUMBER_OF_MUTUALLY_EXCLUSIVE_FLAGS] = + { false, false, false, false, false, false }; + + if (s && *s) { + char *sptr; + + sptr = strchr(s, ','); + if (sptr) { + size_t size1; + char *pos1, *pos2; + + size1 = sptr - s; + + if (size1 < 1) + return (-1); + pos1 = sort_malloc((size1 + 1) * sizeof(char)); + + strncpy(pos1, s, size1); + pos1[size1] = '\0'; + + ret = parse_pos(pos1, ks, mef_flags, false); + + sort_free(pos1); + if (ret < 0) + return (ret); + + pos2 = sort_strdup(sptr + 1); + ret = parse_pos(pos2, ks, mef_flags, true); + sort_free(pos2); + } else + ret = parse_pos(s, ks, mef_flags, false); + } + + return (ret); +} + +/* + * Parse POS in +POS -POS option. + */ +static int +parse_pos_obs(const char *s, int *nf, int *nc, char* sopts) +{ + regex_t re; + regmatch_t pmatch[4]; + char *c, *f; + const char *sregexp = "^([0-9]+)(\\.[0-9]+)?([A-Za-z]+)?$"; + int ret; + size_t len, nmatch; + + ret = -1; + nmatch = 4; + c = f = NULL; + *nc = *nf = 0; + + if (regcomp(&re, sregexp, REG_EXTENDED) != 0) + return (-1); + + if (regexec(&re, s, nmatch, pmatch, 0) != 0) + goto end; + + if (pmatch[0].rm_eo <= pmatch[0].rm_so) + goto end; + + if (pmatch[1].rm_eo <= pmatch[1].rm_so) + goto end; + + len = pmatch[1].rm_eo - pmatch[1].rm_so; + f = sort_malloc((len + 1) * sizeof(char)); + + strncpy(f, s + pmatch[1].rm_so, len); + f[len] = '\0'; + + errno = 0; + *nf = (size_t) strtoul(f, NULL, 10); + if (errno != 0) + errx(2, getstr(11)); + + if (pmatch[2].rm_eo > pmatch[2].rm_so) { + len = pmatch[2].rm_eo - pmatch[2].rm_so - 1; + c = sort_malloc((len + 1) * sizeof(char)); + + strncpy(c, s + pmatch[2].rm_so + 1, len); + c[len] = '\0'; + + errno = 0; + *nc = (size_t) strtoul(c, NULL, 10); + if (errno != 0) + errx(2, getstr(11)); + } + + if (pmatch[3].rm_eo > pmatch[3].rm_so) { + + len = pmatch[3].rm_eo - pmatch[3].rm_so; + + strncpy(sopts, s + pmatch[3].rm_so, len); + sopts[len] = '\0'; + } + + ret = 0; + +end: + if (c) + sort_free(c); + if (f) + sort_free(f); + regfree(&re); + + return (ret); +} + +/* + * "Translate" obsolete +POS1 -POS2 syntax into new -kPOS1,POS2 syntax + */ +void +fix_obsolete_keys(int *argc, char **argv) +{ + char sopt[129]; + + for (int i = 1; i < *argc; i++) { + char *arg1; + + arg1 = argv[i]; + + if (strlen(arg1) > 1 && arg1[0] == '+') { + int c1, f1; + char sopts1[128]; + + sopts1[0] = 0; + c1 = f1 = 0; + + if (parse_pos_obs(arg1 + 1, &f1, &c1, sopts1) < 0) + continue; + else { + f1 += 1; + c1 += 1; + if (i + 1 < *argc) { + char *arg2 = argv[i + 1]; + + if (strlen(arg2) > 1 && + arg2[0] == '-') { + int c2, f2; + char sopts2[128]; + + sopts2[0] = 0; + c2 = f2 = 0; + + if (parse_pos_obs(arg2 + 1, + &f2, &c2, sopts2) >= 0) { + if (c2 > 0) + f2 += 1; + sprintf(sopt, "-k%d.%d%s,%d.%d%s", + f1, c1, sopts1, f2, c2, sopts2); + argv[i] = sort_strdup(sopt); + for (int j = i + 1; j + 1 < *argc; j++) + argv[j] = argv[j + 1]; + *argc -= 1; + continue; + } + } + } + sprintf(sopt, "-k%d.%d", f1, c1); + argv[i] = sort_strdup(sopt); + } + } + } +} + +/* + * Set random seed + */ +static void +set_random_seed(void) +{ + if (need_random) { + + if (strcmp(random_source, DEFAULT_RANDOM_SORT_SEED_FILE) == 0) { + FILE* fseed; + MD5_CTX ctx; + char rsd[MAX_DEFAULT_RANDOM_SEED_DATA_SIZE]; + size_t sz = 0; + + fseed = openfile(random_source, "r"); + while (!feof(fseed)) { + int cr; + + cr = fgetc(fseed); + if (cr == EOF) + break; + + rsd[sz++] = (char) cr; + + if (sz >= MAX_DEFAULT_RANDOM_SEED_DATA_SIZE) + break; + } + + closefile(fseed, random_source); + + MD5Init(&ctx); + MD5Update(&ctx, rsd, sz); + + random_seed = MD5End(&ctx, NULL); + random_seed_size = strlen(random_seed); + + } else { + MD5_CTX ctx; + char *b; + + MD5Init(&ctx); + b = MD5File(random_source, NULL); + if (b == NULL) + err(2, NULL); + + random_seed = b; + random_seed_size = strlen(b); + } + + MD5Init(&md5_ctx); + if(random_seed_size>0) { + MD5Update(&md5_ctx, random_seed, random_seed_size); + } + } +} + +/* + * Main function. + */ +int +main(int argc, char **argv) +{ + char *outfile, *real_outfile; + int c, result; + bool mef_flags[NUMBER_OF_MUTUALLY_EXCLUSIVE_FLAGS] = + { false, false, false, false, false, false }; + + result = 0; + outfile = sort_strdup("-"); + real_outfile = NULL; + + struct sort_mods *sm = &default_sort_mods_object; + + init_tmp_files(); + + set_signal_handler(); + + set_hw_params(); + set_locale(); + set_tmpdir(); + set_sort_opts(); + +#if 0 + { + static int counter = 0; + char fn[128]; + sprintf(fn, "/var/tmp/debug.sort.%d", counter++); + FILE* f = fopen(fn, "w"); + fprintf(f, ">>sort>>"); + for (int i = 0; i < argc; i++) { + fprintf(f, "<%s>", argv[i]); + } + fprintf(f, "< 1) { + if (strcmp(optarg, "\\0")) { + errx(2, "%s: %s\n", + strerror(EINVAL), optarg); + } + *optarg = 0; + } + sort_opts_vals.tflag = true; + sort_opts_vals.field_sep = btowc(optarg[0]); + if (sort_opts_vals.field_sep == WEOF) { + errno = EINVAL; + err(2, NULL); + } + if (!gnusort_numeric_compatibility) { + if (symbol_decimal_point == sort_opts_vals.field_sep) + symbol_decimal_point = WEOF; + if (symbol_thousands_sep == sort_opts_vals.field_sep) + symbol_thousands_sep = WEOF; + if (symbol_negative_sign == sort_opts_vals.field_sep) + symbol_negative_sign = WEOF; + if (symbol_positive_sign == sort_opts_vals.field_sep) + symbol_positive_sign = WEOF; + } + break; + case 'u': + sort_opts_vals.uflag = true; + /* stable sort for the correct unique val */ + sort_opts_vals.sflag = true; + break; + case 'z': + sort_opts_vals.zflag = true; + break; + case SORT_OPT: + if (optarg) { + if (!strcmp(optarg, "general-numeric")) + set_sort_modifier(sm, 'g'); + else if (!strcmp(optarg, "human-numeric")) + set_sort_modifier(sm, 'h'); + else if (!strcmp(optarg, "numeric")) + set_sort_modifier(sm, 'n'); + else if (!strcmp(optarg, "month")) + set_sort_modifier(sm, 'M'); + else if (!strcmp(optarg, "random")) + set_sort_modifier(sm, 'R'); + else + unknown(optarg); + } + break; +#if defined(SORT_THREADS) + case NTHREADS_OPT: + nthreads = (size_t)(atoi(optarg)); + if (nthreads < 1) + nthreads = 1; + if (nthreads > 1024) + nthreads = 1024; + break; +#endif + case QSORT_OPT: + sort_opts_vals.sort_method = SORT_QSORT; + break; + case MERGESORT_OPT: + sort_opts_vals.sort_method = SORT_MERGESORT; + break; + case HEAPSORT_OPT: + sort_opts_vals.sort_method = SORT_HEAPSORT; + break; + case RADIXSORT_OPT: + sort_opts_vals.sort_method = SORT_RADIXSORT; + break; + case RANDOMSOURCE_OPT: + random_source = strdup(optarg); + break; + case COMPRESSPROGRAM_OPT: + compress_program = strdup(optarg); + break; + case FF_OPT: + read_fns_from_file0(optarg); + break; + case BS_OPT: + { + errno = 0; + long mof = strtol(optarg, NULL, 10); + if (errno != 0) + errx(2, "--batch-size: %s", + strerror(errno)); + if (mof >= 2) + max_open_files = (size_t) mof + 1; + } + break; + case VERSION_OPT: + printf("%s\n", VERSION); + exit(EXIT_SUCCESS); + /* NOTREACHED */ + break; + case DEBUG_OPT: + debug_sort = true; + break; + case HELP_OPT: + usage(false); + /* NOTREACHED */ + break; + default: + usage(true); + /* NOTREACHED */ + } + } + } + + argc -= optind; + argv += optind; + +#ifndef WITHOUT_NLS + catalog = catopen("sort", NL_CAT_LOCALE); +#endif + + if (sort_opts_vals.cflag && sort_opts_vals.mflag) + errx(1, getstr(1), 'm', 'c'); + +#ifndef WITHOUT_NLS + catclose(catalog); +#endif + + if (keys_num == 0) { + keys_num = 1; + keys = sort_realloc(keys, sizeof(struct key_specs)); + memset(&(keys[0]), 0, sizeof(struct key_specs)); + keys[0].c1 = 1; + keys[0].pos1b = default_sort_mods->bflag; + keys[0].pos2b = default_sort_mods->bflag; + memcpy(&(keys[0].sm), default_sort_mods, + sizeof(struct sort_mods)); + } + + for (size_t i = 0; i < keys_num; i++) { + struct key_specs *ks; + + ks = &(keys[i]); + + if (sort_modifier_empty(&(ks->sm)) && !(ks->pos1b) && + !(ks->pos2b)) { + ks->pos1b = sm->bflag; + ks->pos2b = sm->bflag; + memcpy(&(ks->sm), sm, sizeof(struct sort_mods)); + } + + ks->sm.func = get_sort_func(&(ks->sm)); + } + + if (argc_from_file0 >= 0) { + argc = argc_from_file0; + argv = argv_from_file0; + } + + if (debug_sort) { +#if defined(SORT_THREADS) + nthreads = 1; +#endif + printf("Using collate rules of %s locale\n", + setlocale(LC_COLLATE, NULL)); + if (byte_sort) + printf("Byte sort is used\n"); + if (print_symbols_on_debug) { + printf("Decimal Point: <%lc>\n", symbol_decimal_point); + if (symbol_thousands_sep) + printf("Thousands separator: <%lc>\n", + symbol_thousands_sep); + printf("Positive sign: <%lc>\n", symbol_positive_sign); + printf("Negative sign: <%lc>\n", symbol_negative_sign); + } + } + + set_random_seed(); + + /* Case when the outfile equals one of the input files: */ + if (strcmp(outfile, "-")) { + + for(int i = 0; i < argc; ++i) { + if (strcmp(argv[i], outfile) == 0) { + real_outfile = sort_strdup(outfile); + for(;;) { + char* tmp = sort_malloc(strlen(outfile) + + strlen(".tmp") + 1); + + strcpy(tmp, outfile); + strcpy(tmp + strlen(tmp), ".tmp"); + sort_free(outfile); + outfile = tmp; + if (access(outfile, F_OK) < 0) + break; + } + tmp_file_atexit(outfile); + } + } + } + + if (!sort_opts_vals.cflag && !sort_opts_vals.mflag) { + struct file_list fl; + struct sort_list list; + + sort_list_init(&list); + file_list_init(&fl, true); + + if (argc < 1) + procfile("-", &list, &fl); + else { + while (argc > 0) { + procfile(*argv, &list, &fl); + --argc; + ++argv; + } + } + + if (fl.count < 1) + sort_list_to_file(&list, outfile); + else { + if (list.count > 0) { + char *flast = new_tmp_file_name(); + + sort_list_to_file(&list, flast); + file_list_add(&fl, flast, false); + } + merge_files(&fl, outfile); + } + + file_list_clean(&fl); + + /* + * We are about to exit the program, so we can ignore + * the clean-up for speed + * + * sort_list_clean(&list); + */ + + } else if (sort_opts_vals.cflag) { + result = (argc == 0) ? (check("-")) : (check(*argv)); + } else if (sort_opts_vals.mflag) { + struct file_list fl; + + file_list_init(&fl, false); + file_list_populate(&fl, argc, argv, true); + merge_files(&fl, outfile); + file_list_clean(&fl); + } + + if (real_outfile) { + unlink(real_outfile); + if (rename(outfile, real_outfile) < 0) + err(2, NULL); + sort_free(real_outfile); + } + + sort_free(outfile); + + return (result); +} diff --git a/usr.bin/sort/sort.h b/usr.bin/sort/sort.h new file mode 100644 index 000000000000..9d4d1ed8f799 --- /dev/null +++ b/usr.bin/sort/sort.h @@ -0,0 +1,129 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (C) 2009 Gabor Kovesdan + * Copyright (C) 2012 Oleg Moskalenko + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if !defined(__BSD_SORT_H__) +#define __BSD_SORT_H__ + +#include +#include +#include +#include +#include + +#include +#include + +#define VERSION "2.3-FreeBSD" + +#define UNUSED_ARG(A) do { A=A; } while(0) + +#ifdef WITHOUT_NLS +#define getstr(n) nlsstr[n] +#else +#include + +extern nl_catd catalog; +#define getstr(n) catgets(catalog, 1, n, nlsstr[n]) +#endif + +extern const char *nlsstr[]; + +#if defined(SORT_THREADS) +extern size_t ncpu; +extern size_t nthreads; +#endif + +/* + * If true, we output some debug information. + */ +extern bool debug_sort; + +/* + * MD5 context for random hash function + */ +extern MD5_CTX md5_ctx; + +/* + * sort.c + */ + +/* + * This structure holds main sort options which are NOT affecting the sort ordering. + */ +struct sort_opts +{ + wchar_t field_sep; + int sort_method; + bool cflag; + bool csilentflag; + bool kflag; + bool mflag; + bool sflag; + bool uflag; + bool zflag; + bool tflag; + bool complex_sort; +}; + +/* + * Key value structure forward declaration + */ +struct key_value; + +/* + * Cmp function + */ +typedef int (*cmpcoll_t)(struct key_value *kv1, struct key_value *kv2, size_t offset); + +/* + * This structure holds "sort modifiers" - options which are affecting the sort ordering. + */ +struct sort_mods +{ + cmpcoll_t func; + bool bflag; + bool dflag; + bool fflag; + bool gflag; + bool iflag; + bool Mflag; + bool nflag; + bool rflag; + bool Rflag; + bool Vflag; + bool hflag; +}; + +extern bool need_hint; + +extern struct sort_opts sort_opts_vals; + +extern struct sort_mods * const default_sort_mods; + +#endif /* __BSD_SORT_H__ */ diff --git a/usr.bin/sort/vsort.c b/usr.bin/sort/vsort.c new file mode 100644 index 000000000000..593e54c57b54 --- /dev/null +++ b/usr.bin/sort/vsort.c @@ -0,0 +1,265 @@ +/*- + * Copyright (C) 2012 Oleg Moskalenko + * Copyright (C) 2012 Gabor Kovesdan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include + +#include "sort.h" +#include "vsort.h" + +static inline bool +isdigit_clocale(wchar_t c) +{ + + return (c >= L'0' && c <= L'9'); +} + +static inline bool +isalpha_clocale(wchar_t c) +{ + + return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z')); +} + +static inline bool +isalnum_clocale(wchar_t c) +{ + + return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z') || + (c >= L'0' && c <= L'9')); +} + +/* + * Find string suffix of format: (\.[A-Za-z~][A-Za-z0-9~]*)*$ + * Set length of string before suffix. + */ +static void +find_suffix(bwstring_iterator si, bwstring_iterator se, size_t *len) +{ + wchar_t c; + size_t clen; + bool expect_alpha, sfx; + + sfx = false; + expect_alpha = false; + *len = 0; + clen = 0; + + while ((si < se) && (c = bws_get_iter_value(si))) { + if (expect_alpha) { + expect_alpha = false; + if (!isalpha_clocale(c) && (c != L'~')) + sfx = false; + } else if (c == L'.') { + expect_alpha = true; + if (!sfx) { + sfx = true; + *len = clen; + } + } else if (!isalnum_clocale(c) && (c != L'~')) + sfx = false; + + si = bws_iterator_inc(si, 1); + ++clen; + } + + /* This code must be here to make the implementation compatible + * with WORDING of GNU sort documentation. + * But the GNU sort implementation is not following its own + * documentation. GNU sort allows empty file extensions + * (just dot with nothing after); but the regular expression in + * their documentation does not allow empty file extensions. + * We chose to make our implementation compatible with GNU sort + * implementation. If they will ever fix their bug, this code + * must be uncommented. Or they may choose to fix the info page, + * then the code stays commented. + * + if (expect_alpha) + sfx = false; + */ + + if (!sfx) + *len = clen; +} + +static inline int +cmp_chars(wchar_t c1, wchar_t c2) +{ + + if (c1 == c2) + return (0); + + if (c1 == L'~') + return (-1); + if (c2 == L'~') + return (+1); + + if (isdigit_clocale(c1) || !c1) + return ((isdigit_clocale(c2) || !c2) ? 0 : -1); + + if (isdigit_clocale(c2) || !c2) + return (+1); + + if (isalpha_clocale(c1)) + return ((isalpha_clocale(c2)) ? ((int) c1 - (int) c2) : -1); + + if (isalpha_clocale(c2)) + return (+1); + + return ((int) c1 - (int) c2); +} + +static int +cmpversions(bwstring_iterator si1, bwstring_iterator se1, + bwstring_iterator si2, bwstring_iterator se2) +{ + int cmp, diff; + + while ((si1 < se1) || (si2 < se2)) { + diff = 0; + + while (((si1 < se1) && + !isdigit_clocale(bws_get_iter_value(si1))) || + ((si2 < se2) && !isdigit_clocale(bws_get_iter_value(si2)))) { + wchar_t c1, c2; + + c1 = (si1 < se1) ? bws_get_iter_value(si1) : 0; + c2 = (si2 < se2) ? bws_get_iter_value(si2) : 0; + + cmp = cmp_chars(c1, c2); + if (cmp) + return (cmp); + + if (si1 < se1) + si1 = bws_iterator_inc(si1, 1); + if (si2 < se2) + si2 = bws_iterator_inc(si2, 1); + } + + while (bws_get_iter_value(si1) == L'0') + si1 = bws_iterator_inc(si1, 1); + + while (bws_get_iter_value(si2) == L'0') + si2 = bws_iterator_inc(si2, 1); + + while (isdigit_clocale(bws_get_iter_value(si1)) && + isdigit_clocale(bws_get_iter_value(si2))) { + if (!diff) + diff = ((int)bws_get_iter_value(si1) - + (int)bws_get_iter_value(si2)); + si1 = bws_iterator_inc(si1, 1); + si2 = bws_iterator_inc(si2, 1); + } + + if (isdigit_clocale(bws_get_iter_value(si1))) + return (1); + + if (isdigit_clocale(bws_get_iter_value(si2))) + return (-1); + + if (diff) + return (diff); + } + + return (0); +} + +/* + * Compare two version strings + */ +int +vcmp(struct bwstring *s1, struct bwstring *s2) +{ + bwstring_iterator si1, si2; + wchar_t c1, c2; + size_t len1, len2, slen1, slen2; + int cmp_bytes, cmp_res; + + if (s1 == s2) + return (0); + + cmp_bytes = bwscmp(s1, s2, 0); + if (cmp_bytes == 0) + return (0); + + len1 = slen1 = BWSLEN(s1); + len2 = slen2 = BWSLEN(s2); + + if (slen1 < 1) + return (-1); + if (slen2 < 1) + return (+1); + + si1 = bws_begin(s1); + si2 = bws_begin(s2); + + c1 = bws_get_iter_value(si1); + c2 = bws_get_iter_value(si2); + + if (c1 == L'.' && (slen1 == 1)) + return (-1); + + if (c2 == L'.' && (slen2 == 1)) + return (+1); + + if (slen1 == 2 && c1 == L'.' && + bws_get_iter_value(bws_iterator_inc(si1, 1)) == L'.') + return (-1); + if (slen2 == 2 && c2 == L'.' && + bws_get_iter_value(bws_iterator_inc(si2, 1)) == L'.') + return (+1); + + if (c1 == L'.' && c2 != L'.') + return (-1); + if (c1 != L'.' && c2 == L'.') + return (+1); + + if (c1 == L'.' && c2 == L'.') { + si1 = bws_iterator_inc(si1, 1); + si2 = bws_iterator_inc(si2, 1); + } + + find_suffix(si1, bws_end(s1), &len1); + find_suffix(si2, bws_end(s2), &len2); + + if ((len1 == len2) && (bws_iterator_cmp(si1, si2, len1) == 0)) + return (cmp_bytes); + + cmp_res = cmpversions(si1, bws_iterator_inc(si1, len1), si2, + bws_iterator_inc(si2, len2)); + + if (cmp_res == 0) + cmp_res = cmp_bytes; + + return (cmp_res); +} diff --git a/usr.bin/sort/vsort.h b/usr.bin/sort/vsort.h new file mode 100644 index 000000000000..facd075113ee --- /dev/null +++ b/usr.bin/sort/vsort.h @@ -0,0 +1,37 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (C) 2012 Oleg Moskalenko + * Copyright (C) 2012 Gabor Kovesdan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#if !defined(__VSORT_H__) +#define _VSORT_H__ + +#include "bwstring.h" + +int vcmp(struct bwstring *s1, struct bwstring *s2); + +#endif