Replace the current strspn() and strcspn() with significantly faster
implementations inspired by the ones in DragonFly.  Unlike the
DragonFly versions, these have a small data cache footprint, and my
tests show that they're never slower than the old code except when the
charset or the span is 0 or 1 characters.  This implementation is
generally faster than DragonFly's until either the charset or the span
gets in the ballpark of 32 to 64 characters.
This commit is contained in:
das 2005-04-02 18:52:44 +00:00
parent 02f22e6e2c
commit 87aa297030
3 changed files with 121 additions and 93 deletions

View File

@ -1,9 +1,6 @@
/*-
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Chris Torek.
* Copyright (c) 2005 David Schultz <das@FreeBSD.ORG>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -13,18 +10,11 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
@ -34,36 +24,49 @@
* SUCH DAMAGE.
*/
#if defined(LIBC_SCCS) && !defined(lint)
static char sccsid[] = "@(#)strcspn.c 8.1 (Berkeley) 6/4/93";
#endif /* LIBC_SCCS and not lint */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <limits.h>
#include <string.h>
/*
* Span the complement of string s2.
*/
/*
 * NOTE(review): this span interleaves two definitions of strcspn(): a
 * K&R-style one that rescans s2 for every character of s1, and a
 * prototype-style one that builds a bitmap of charset first.  The notes
 * added below describe the bitmap version.
 */
size_t
strcspn(s1, s2)
const char *s1;
const char *s2;
{
const char *p, *spanp;
char c, sc;
/*
 * The lookup table keeps one bit per possible u_char value, packed into
 * u_long words of LONG_BIT bits each.  IDX(c) is the index of the word
 * that holds the bit for byte c; BIT(c) is the mask selecting that bit
 * within the word.
 */
#define IDX(c) ((u_char)(c) / LONG_BIT)
#define BIT(c) ((u_long)1 << ((u_char)(c) % LONG_BIT))
/*
 * Return the number of leading bytes of s that do not occur in charset.
 * The count also stops at the terminating NUL of s.
 */
size_t
strcspn(const char *s, const char *charset)
{
/*
* Stop as soon as we find any character from s2. Note that there
* must be a NUL in s2; it suffices to stop when we find that, too.
* NB: idx and bit are temporaries whose use causes gcc 3.4.2 to
* generate better code. Without them, gcc gets a little confused.
*/
for (p = s1;;) {
c = *p++;
spanp = s2;
do {
if ((sc = *spanp++) == c)
return (p - 1 - s1);
} while (sc != 0);
const char *s1;
u_long bit;
u_long tbl[(UCHAR_MAX + 1) / LONG_BIT];
int idx;
/* Empty input: the span is 0, so skip building the table. */
if(*s == '\0')
return (0);
/*
 * Clear the table, except that word 0 starts out as 1: that is the bit
 * for byte value 0, so the scan loop below also stops at the NUL that
 * terminates s.
 */
#if LONG_BIT == 64 /* always better to unroll on 64-bit architectures */
tbl[0] = 1;
tbl[3] = tbl[2] = tbl[1] = 0;
#else
for (tbl[0] = idx = 1; idx < sizeof(tbl) / sizeof(tbl[0]); idx++)
tbl[idx] = 0;
#endif
/* Set the bit of every byte that appears in charset. */
for (; *charset != '\0'; charset++) {
idx = IDX(*charset);
bit = BIT(*charset);
tbl[idx] |= bit;
}
/* NOTREACHED */
/* Advance until the first byte whose bit is set (a charset byte or NUL). */
for(s1 = s; ; s1++) {
idx = IDX(*s1);
bit = BIT(*s1);
if ((tbl[idx] & bit) != 0)
break;
}
/* Distance from the start of s to the stopping byte. */
return (s1 - s);
}

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
/*-
* Copyright (c) 2005 David Schultz <das@FreeBSD.ORG>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -10,18 +10,11 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
@ -31,32 +24,48 @@
* SUCH DAMAGE.
*/
#if defined(LIBC_SCCS) && !defined(lint)
static char sccsid[] = "@(#)strspn.c 8.1 (Berkeley) 6/4/93";
#endif /* LIBC_SCCS and not lint */
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
#include <sys/types.h>
#include <limits.h>
#include <string.h>
/*
* Span the string s2 (skip characters that are in s2).
*/
/*
 * NOTE(review): this span interleaves two definitions of strspn(): a
 * K&R-style one that rescans s2 for every character of s1, and a
 * prototype-style one that builds a bitmap of charset first.  The notes
 * added below describe the bitmap version.
 */
size_t
strspn(s1, s2)
const char *s1;
const char *s2;
{
const char *p = s1, *spanp;
char c, sc;
/*
 * The lookup table keeps one bit per possible u_char value, packed into
 * u_long words of LONG_BIT bits each.  IDX(c) is the index of the word
 * that holds the bit for byte c; BIT(c) is the mask selecting that bit
 * within the word.
 */
#define IDX(c) ((u_char)(c) / LONG_BIT)
#define BIT(c) ((u_long)1 << ((u_char)(c) % LONG_BIT))
/*
 * Return the number of leading bytes of s that all occur in charset.
 */
size_t
strspn(const char *s, const char *charset)
{
/*
* Skip any characters in s2, excluding the terminating \0.
* NB: idx and bit are temporaries whose use causes gcc 3.4.2 to
* generate better code. Without them, gcc gets a little confused.
*/
cont:
c = *p++;
for (spanp = s2; (sc = *spanp++) != 0;)
if (sc == c)
goto cont;
return (p - 1 - s1);
const char *s1;
u_long bit;
u_long tbl[(UCHAR_MAX + 1) / LONG_BIT];
int idx;
/* Empty input: the span is 0, so skip building the table. */
if(*s == '\0')
return (0);
/* Start with every bit clear. */
#if LONG_BIT == 64 /* always better to unroll on 64-bit architectures */
tbl[3] = tbl[2] = tbl[1] = tbl[0] = 0;
#else
for (idx = 0; idx < sizeof(tbl) / sizeof(tbl[0]); idx++)
tbl[idx] = 0;
#endif
/*
 * Set the bit of every byte that appears in charset.  The loop ends
 * before the NUL, so the bit for byte value 0 is never set.
 */
for (; *charset != '\0'; charset++) {
idx = IDX(*charset);
bit = BIT(*charset);
tbl[idx] |= bit;
}
/*
 * Advance until the first byte whose bit is clear.  Because the NUL bit
 * stays clear, this also stops at the terminator of s.
 */
for(s1 = s; ; s1++) {
idx = IDX(*s1);
bit = BIT(*s1);
if ((tbl[idx] & bit) == 0)
break;
}
/* Distance from the start of s to the stopping byte. */
return (s1 - s);
}

View File

@ -1,6 +1,6 @@
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
/*-
* Copyright (c) 2005 David Schultz <das@FreeBSD.ORG>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@ -10,14 +10,11 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
@ -31,25 +28,44 @@
__FBSDID("$FreeBSD$");
#include <sys/libkern.h>
#include <sys/limits.h>
#include <sys/types.h>
/*
 * NOTE(review): this span interleaves two definitions of strspn(): a
 * K&R-style one that rescans s2 for every character of s1, and a
 * prototype-style one that builds a bitmap of charset first.  The notes
 * added below describe the bitmap version.
 *
 * The lookup table keeps one bit per possible u_char value, packed into
 * u_long words of LONG_BIT bits each.  IDX(c) is the index of the word
 * that holds the bit for byte c; BIT(c) is the mask selecting that bit
 * within the word.
 */
#define IDX(c) ((u_char)(c) / LONG_BIT)
#define BIT(c) ((u_long)1 << ((u_char)(c) % LONG_BIT))
/*
* Span the string s2 (skip characters that are in s2).
*/
/*
 * Return the number of leading bytes of s that all occur in charset.
 */
size_t
strspn(s1, s2)
const char *s1;
const char *s2;
strspn(const char *s, const char *charset)
{
const char *p = s1, *spanp;
char c, sc;
/*
* Skip any characters in s2, excluding the terminating \0.
* NB: idx and bit are temporaries whose use causes gcc 3.4.2 to
* generate better code. Without them, gcc gets a little confused.
*/
cont:
c = *p++;
for (spanp = s2; (sc = *spanp++) != 0;)
if (sc == c)
goto cont;
return (p - 1 - s1);
const char *s1;
u_long bit;
u_long tbl[(UCHAR_MAX + 1) / LONG_BIT];
int idx;
/* Empty input: the span is 0, so skip building the table. */
if(*s == '\0')
return (0);
/* Start with every bit clear. */
#if LONG_BIT == 64 /* always better to unroll on 64-bit architectures */
tbl[3] = tbl[2] = tbl[1] = tbl[0] = 0;
#else
for (idx = 0; idx < sizeof(tbl) / sizeof(tbl[0]); idx++)
tbl[idx] = 0;
#endif
/*
 * Set the bit of every byte that appears in charset.  The loop ends
 * before the NUL, so the bit for byte value 0 is never set.
 */
for (; *charset != '\0'; charset++) {
idx = IDX(*charset);
bit = BIT(*charset);
tbl[idx] |= bit;
}
/*
 * Advance until the first byte whose bit is clear.  Because the NUL bit
 * stays clear, this also stops at the terminator of s.
 */
for(s1 = s; ; s1++) {
idx = IDX(*s1);
bit = BIT(*s1);
if ((tbl[idx] & bit) == 0)
break;
}
/* Distance from the start of s to the stopping byte. */
return (s1 - s);
}