Reimplement strlen
The previous code neglected to use primitives which can find the end of the string without having to branch on every character.

While here, augment the somewhat misleading commentary -- strlen as implemented here leaves performance on the table, especially so for userspace. Every arch should get a dedicated variant instead. In the meantime, this commit lessens the problem.

Tested with the glibc test suite.

Naive test just calling strlen in a loop on Haswell (ops/s):

$(perl -e "print 'A' x 3"):
before: 211198039
after:  338626619

$(perl -e "print 'A' x 100"):
before: 83151997
after:  98285919
This commit is contained in:
parent
cb984c62d7
commit
710e45c4b8
@ -35,10 +35,6 @@ __FBSDID("$FreeBSD$");
|
|||||||
/*
|
/*
|
||||||
* Portable strlen() for 32-bit and 64-bit systems.
|
* Portable strlen() for 32-bit and 64-bit systems.
|
||||||
*
|
*
|
||||||
* Rationale: it is generally much more efficient to do word length
|
|
||||||
* operations and avoid branches on modern computer systems, as
|
|
||||||
* compared to byte-length operations with a lot of branches.
|
|
||||||
*
|
|
||||||
* The expression:
|
* The expression:
|
||||||
*
|
*
|
||||||
* ((x - 0x01....01) & ~x & 0x80....80)
|
* ((x - 0x01....01) & ~x & 0x80....80)
|
||||||
@ -46,18 +42,13 @@ __FBSDID("$FreeBSD$");
|
|||||||
* would evaluate to a non-zero value iff any of the bytes in the
|
* would evaluate to a non-zero value iff any of the bytes in the
|
||||||
* original word is zero.
|
* original word is zero.
|
||||||
*
|
*
|
||||||
* On multi-issue processors, we can divide the above expression into:
|
|
||||||
* a) (x - 0x01....01)
|
|
||||||
* b) (~x & 0x80....80)
|
|
||||||
* c) a & b
|
|
||||||
*
|
|
||||||
* Where, a) and b) can be partially computed in parallel.
|
|
||||||
*
|
|
||||||
* The algorithm above is found on "Hacker's Delight" by
|
* The algorithm above is found on "Hacker's Delight" by
|
||||||
* Henry S. Warren, Jr.
|
* Henry S. Warren, Jr.
|
||||||
|
*
|
||||||
|
* Note: this leaves performance on the table and each architecture
|
||||||
|
* would be best served with a tailor made routine instead.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* Magic numbers for the algorithm */
|
|
||||||
#if LONG_BIT == 32
|
#if LONG_BIT == 32
|
||||||
static const unsigned long mask01 = 0x01010101;
|
static const unsigned long mask01 = 0x01010101;
|
||||||
static const unsigned long mask80 = 0x80808080;
|
static const unsigned long mask80 = 0x80808080;
|
||||||
@ -70,62 +61,45 @@ static const unsigned long mask80 = 0x8080808080808080;
|
|||||||
|
|
||||||
#define LONGPTR_MASK (sizeof(long) - 1)
|
#define LONGPTR_MASK (sizeof(long) - 1)
|
||||||
|
|
||||||
/*
|
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||||
* Helper macro to return string length if we caught the zero
|
#define FINDZERO __builtin_ctzl
|
||||||
* byte.
|
#else
|
||||||
*/
|
#define FINDZERO __builtin_clzl
|
||||||
#define testbyte(x) \
|
#endif
|
||||||
do { \
|
|
||||||
if (p[x] == '\0') \
|
|
||||||
return (p - str + x); \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
size_t
|
size_t
|
||||||
strlen(const char *str)
|
strlen(const char *str)
|
||||||
{
|
{
|
||||||
const char *p;
|
|
||||||
const unsigned long *lp;
|
const unsigned long *lp;
|
||||||
|
unsigned long mask;
|
||||||
long va, vb;
|
long va, vb;
|
||||||
|
long val;
|
||||||
|
|
||||||
/*
|
lp = (unsigned long *) (uintptr_t) str;
|
||||||
* Before trying the hard (unaligned byte-by-byte access) way
|
if ((uintptr_t)lp & LONGPTR_MASK) {
|
||||||
* to figure out whether there is a nul character, try to see
|
lp = (__typeof(lp)) ((uintptr_t)lp & ~LONGPTR_MASK);
|
||||||
* if there is a nul character is within this accessible word
|
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||||
* first.
|
mask = ~(~0UL << (((uintptr_t)str & LONGPTR_MASK) << 3));
|
||||||
*
|
#else
|
||||||
* p and (p & ~LONGPTR_MASK) must be equally accessible since
|
mask = ~(~0UL >> (((uintptr_t)str & LONGPTR_MASK) << 3));
|
||||||
* they always fall in the same memory page, as long as page
|
#endif
|
||||||
* boundaries is integral multiple of word size.
|
val = *lp | mask;
|
||||||
*/
|
va = (val - mask01);
|
||||||
lp = (const unsigned long *)((uintptr_t)str & ~LONGPTR_MASK);
|
vb = ((~val) & mask80);
|
||||||
va = (*lp - mask01);
|
if (va & vb) {
|
||||||
vb = ((~*lp) & mask80);
|
return ((const char *)lp - str + (FINDZERO(va & vb) >> 3));
|
||||||
lp++;
|
}
|
||||||
if (va & vb)
|
lp++;
|
||||||
/* Check if we have \0 in the first part */
|
}
|
||||||
for (p = str; p < (const char *)lp; p++)
|
|
||||||
if (*p == '\0')
|
|
||||||
return (p - str);
|
|
||||||
|
|
||||||
/* Scan the rest of the string using word sized operation */
|
|
||||||
for (; ; lp++) {
|
for (; ; lp++) {
|
||||||
va = (*lp - mask01);
|
va = (*lp - mask01);
|
||||||
vb = ((~*lp) & mask80);
|
vb = ((~*lp) & mask80);
|
||||||
if (va & vb) {
|
if (va & vb) {
|
||||||
p = (const char *)(lp);
|
return ((const char *)lp - str + (FINDZERO(va & vb) >> 3));
|
||||||
testbyte(0);
|
|
||||||
testbyte(1);
|
|
||||||
testbyte(2);
|
|
||||||
testbyte(3);
|
|
||||||
#if (LONG_BIT >= 64)
|
|
||||||
testbyte(4);
|
|
||||||
testbyte(5);
|
|
||||||
testbyte(6);
|
|
||||||
testbyte(7);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* NOTREACHED */
|
__builtin_unreachable();
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
@ -34,10 +34,6 @@ __FBSDID("$FreeBSD$");
|
|||||||
/*
|
/*
|
||||||
* Portable strlen() for 32-bit and 64-bit systems.
|
* Portable strlen() for 32-bit and 64-bit systems.
|
||||||
*
|
*
|
||||||
* Rationale: it is generally much more efficient to do word length
|
|
||||||
* operations and avoid branches on modern computer systems, as
|
|
||||||
* compared to byte-length operations with a lot of branches.
|
|
||||||
*
|
|
||||||
* The expression:
|
* The expression:
|
||||||
*
|
*
|
||||||
* ((x - 0x01....01) & ~x & 0x80....80)
|
* ((x - 0x01....01) & ~x & 0x80....80)
|
||||||
@ -45,18 +41,10 @@ __FBSDID("$FreeBSD$");
|
|||||||
* would evaluate to a non-zero value iff any of the bytes in the
|
* would evaluate to a non-zero value iff any of the bytes in the
|
||||||
* original word is zero.
|
* original word is zero.
|
||||||
*
|
*
|
||||||
* On multi-issue processors, we can divide the above expression into:
|
|
||||||
* a) (x - 0x01....01)
|
|
||||||
* b) (~x & 0x80....80)
|
|
||||||
* c) a & b
|
|
||||||
*
|
|
||||||
* Where, a) and b) can be partially computed in parallel.
|
|
||||||
*
|
|
||||||
* The algorithm above is found on "Hacker's Delight" by
|
* The algorithm above is found on "Hacker's Delight" by
|
||||||
* Henry S. Warren, Jr.
|
* Henry S. Warren, Jr.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* Magic numbers for the algorithm */
|
|
||||||
#if LONG_BIT == 32
|
#if LONG_BIT == 32
|
||||||
static const unsigned long mask01 = 0x01010101;
|
static const unsigned long mask01 = 0x01010101;
|
||||||
static const unsigned long mask80 = 0x80808080;
|
static const unsigned long mask80 = 0x80808080;
|
||||||
@ -69,62 +57,45 @@ static const unsigned long mask80 = 0x8080808080808080;
|
|||||||
|
|
||||||
#define LONGPTR_MASK (sizeof(long) - 1)
|
#define LONGPTR_MASK (sizeof(long) - 1)
|
||||||
|
|
||||||
/*
|
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||||
* Helper macro to return string length if we caught the zero
|
#define FINDZERO __builtin_ctzl
|
||||||
* byte.
|
#else
|
||||||
*/
|
#define FINDZERO __builtin_clzl
|
||||||
#define testbyte(x) \
|
#endif
|
||||||
do { \
|
|
||||||
if (p[x] == '\0') \
|
|
||||||
return (p - str + x); \
|
|
||||||
} while (0)
|
|
||||||
|
|
||||||
size_t
|
size_t
|
||||||
(strlen)(const char *str)
|
(strlen)(const char *str)
|
||||||
{
|
{
|
||||||
const char *p;
|
|
||||||
const unsigned long *lp;
|
const unsigned long *lp;
|
||||||
|
unsigned long mask;
|
||||||
long va, vb;
|
long va, vb;
|
||||||
|
long val;
|
||||||
|
|
||||||
/*
|
lp = (unsigned long *) (uintptr_t) str;
|
||||||
* Before trying the hard (unaligned byte-by-byte access) way
|
if ((uintptr_t)lp & LONGPTR_MASK) {
|
||||||
* to figure out whether there is a nul character, try to see
|
lp = (__typeof(lp)) ((uintptr_t)lp & ~LONGPTR_MASK);
|
||||||
* if there is a nul character is within this accessible word
|
#if BYTE_ORDER == LITTLE_ENDIAN
|
||||||
* first.
|
mask = ~(~0UL << (((uintptr_t)str & LONGPTR_MASK) << 3));
|
||||||
*
|
#else
|
||||||
* p and (p & ~LONGPTR_MASK) must be equally accessible since
|
mask = ~(~0UL >> (((uintptr_t)str & LONGPTR_MASK) << 3));
|
||||||
* they always fall in the same memory page, as long as page
|
#endif
|
||||||
* boundaries is integral multiple of word size.
|
val = *lp | mask;
|
||||||
*/
|
va = (val - mask01);
|
||||||
lp = (const unsigned long *)((uintptr_t)str & ~LONGPTR_MASK);
|
vb = ((~val) & mask80);
|
||||||
va = (*lp - mask01);
|
if (va & vb) {
|
||||||
vb = ((~*lp) & mask80);
|
return ((const char *)lp - str + (FINDZERO(va & vb) >> 3));
|
||||||
lp++;
|
}
|
||||||
if (va & vb)
|
lp++;
|
||||||
/* Check if we have \0 in the first part */
|
}
|
||||||
for (p = str; p < (const char *)lp; p++)
|
|
||||||
if (*p == '\0')
|
|
||||||
return (p - str);
|
|
||||||
|
|
||||||
/* Scan the rest of the string using word sized operation */
|
|
||||||
for (; ; lp++) {
|
for (; ; lp++) {
|
||||||
va = (*lp - mask01);
|
va = (*lp - mask01);
|
||||||
vb = ((~*lp) & mask80);
|
vb = ((~*lp) & mask80);
|
||||||
if (va & vb) {
|
if (va & vb) {
|
||||||
p = (const char *)(lp);
|
return ((const char *)lp - str + (FINDZERO(va & vb) >> 3));
|
||||||
testbyte(0);
|
|
||||||
testbyte(1);
|
|
||||||
testbyte(2);
|
|
||||||
testbyte(3);
|
|
||||||
#if (LONG_BIT >= 64)
|
|
||||||
testbyte(4);
|
|
||||||
testbyte(5);
|
|
||||||
testbyte(6);
|
|
||||||
testbyte(7);
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* NOTREACHED */
|
__builtin_unreachable();
|
||||||
return (0);
|
return (0);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user