Avoid code duplication and implement bitcount32() function in systm.h only.

Reviewed by:	cperciva
MFC after:	3 days
This commit is contained in:
Pawel Jakub Dawidek 2005-08-19 22:10:19 +00:00
parent ad7c49168f
commit a95452ee8d
5 changed files with 55 additions and 113 deletions

View File

@ -812,58 +812,6 @@ smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
ia32_pause();
}
/*
* This is about as magic as it gets. fortune(1) has got similar code
* for reversing bits in a word. Who thinks up this stuff??
*
* Yes, it does appear to be consistently faster than:
* while (i = ffs(m)) {
* m >>= i;
* bits++;
* }
* and
* while (lsb = (m & -m)) { // This is magic too
* m &= ~lsb; // or: m ^= lsb
* bits++;
* }
* Both of these latter forms do some very strange things on gcc-3.1 with
* -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2.
* There is probably an SSE or MMX popcnt instruction.
*
* I wonder if this should be in libkern?
*
* XXX Stop the presses! Another one:
* static __inline u_int32_t
* popcnt1(u_int32_t v)
* {
* v -= ((v >> 1) & 0x55555555);
* v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
* v = (v + (v >> 4)) & 0x0F0F0F0F;
* return (v * 0x01010101) >> 24;
* }
* The downside is that it has a multiply. With a pentium3 with
* -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use
* an imull, and in that case it is faster. In most other cases
* it appears slightly slower.
*
* Another variant (also from fortune):
* #define BITCOUNT(x) (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255)
* #define BX_(x) ((x) - (((x)>>1)&0x77777777) \
* - (((x)>>2)&0x33333333) \
* - (((x)>>3)&0x11111111))
*/
/*
 * Population count: return the number of set bits in a 32-bit word.
 *
 * Classic SWAR (SIMD-within-a-register) reduction: each step adds
 * adjacent groups of bits in parallel, doubling the group width
 * (1->2->4->8->16 bits) until one 32-bit total remains.
 */
static __inline u_int32_t
popcnt(u_int32_t m)
{

	/* Pairwise sums of 1-bit fields. */
	m = (m & 0x55555555) + ((m >> 1) & 0x55555555);
	/* Sums of adjacent 2-bit fields. */
	m = (m & 0x33333333) + ((m >> 2) & 0x33333333);
	/* Sums of adjacent 4-bit fields. */
	m = (m & 0x0f0f0f0f) + ((m >> 4) & 0x0f0f0f0f);
	/* Sums of adjacent 8-bit fields. */
	m = (m & 0x00ff00ff) + ((m >> 8) & 0x00ff00ff);
	/* Final fold; high half is already <= 16, no mask needed after shift. */
	m = (m & 0x0000ffff) + (m >> 16);
	return (m);
}
static void
smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
@ -878,7 +826,7 @@ smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offse
mask &= ~PCPU_GET(cpumask);
if (mask == 0)
return;
ncpu = popcnt(mask);
ncpu = bitcount32(mask);
if (ncpu > othercpus) {
/* XXX this should be a panic offence */
printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",

View File

@ -792,7 +792,7 @@ g_stripe_create(struct g_class *mp, const struct g_stripe_metadata *md,
sc->sc_id = md->md_id;
sc->sc_stripesize = md->md_stripesize;
sc->sc_stripebits = BITCOUNT(sc->sc_stripesize - 1);
sc->sc_stripebits = bitcount32(sc->sc_stripesize - 1);
sc->sc_ndisks = md->md_all;
sc->sc_disks = malloc(sizeof(struct g_consumer *) * sc->sc_ndisks,
M_STRIPE, M_WAITOK | M_ZERO);

View File

@ -120,10 +120,4 @@ stripe_metadata_decode(const u_char *data, struct g_stripe_metadata *md)
md->md_provsize = le64dec(data + 64);
}
#ifndef BITCOUNT
/*
 * Loop-free population count of a 32-bit value x.
 * BX_(x) leaves each 4-bit nibble holding the number of bits that
 * were set in that nibble; the (v + (v >> 4)) & 0x0F0F0F0F step
 * merges nibble pairs into per-byte counts, and % 255 sums the four
 * byte counts into one result (digit-sum trick: 256 == 1 mod 255).
 */
#define BITCOUNT(x) (((BX_(x) + (BX_(x) >> 4)) & 0x0F0F0F0F) % 255)
#define BX_(x) ((x) - (((x) >> 1) & 0x77777777) - \
(((x) >> 2) & 0x33333333) - (((x) >> 3) & 0x11111111))
#endif
#endif /* _G_STRIPE_H_ */

View File

@ -1008,58 +1008,6 @@ smp_tlb_shootdown(u_int vector, vm_offset_t addr1, vm_offset_t addr2)
ia32_pause();
}
/*
* This is about as magic as it gets. fortune(1) has got similar code
* for reversing bits in a word. Who thinks up this stuff??
*
* Yes, it does appear to be consistently faster than:
* while (i = ffs(m)) {
* m >>= i;
* bits++;
* }
* and
* while (lsb = (m & -m)) { // This is magic too
* m &= ~lsb; // or: m ^= lsb
* bits++;
* }
* Both of these latter forms do some very strange things on gcc-3.1 with
* -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2.
* There is probably an SSE or MMX popcnt instruction.
*
* I wonder if this should be in libkern?
*
* XXX Stop the presses! Another one:
* static __inline u_int32_t
* popcnt1(u_int32_t v)
* {
* v -= ((v >> 1) & 0x55555555);
* v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
* v = (v + (v >> 4)) & 0x0F0F0F0F;
* return (v * 0x01010101) >> 24;
* }
* The downside is that it has a multiply. With a pentium3 with
* -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use
* an imull, and in that case it is faster. In most other cases
* it appears slightly slower.
*
* Another variant (also from fortune):
* #define BITCOUNT(x) (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255)
* #define BX_(x) ((x) - (((x)>>1)&0x77777777) \
* - (((x)>>2)&0x33333333) \
* - (((x)>>3)&0x11111111))
*/
/*
 * Population count: return the number of set bits in a 32-bit word.
 *
 * Divide-and-conquer bit summation: adjacent bit groups are added in
 * parallel across the whole word, with the group width doubling each
 * step until the full 32-bit count is accumulated in the low bits.
 */
static __inline u_int32_t
popcnt(u_int32_t m)
{

	m = (m & 0x55555555) + ((m >> 1) & 0x55555555);	/* 1-bit sums */
	m = (m & 0x33333333) + ((m >> 2) & 0x33333333);	/* 2-bit sums */
	m = (m & 0x0f0f0f0f) + ((m >> 4) & 0x0f0f0f0f);	/* 4-bit sums */
	m = (m & 0x00ff00ff) + ((m >> 8) & 0x00ff00ff);	/* 8-bit sums */
	m = (m & 0x0000ffff) + (m >> 16);		/* final total */
	return (m);
}
static void
smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offset_t addr2)
{
@ -1074,7 +1022,7 @@ smp_targeted_tlb_shootdown(u_int mask, u_int vector, vm_offset_t addr1, vm_offse
mask &= ~PCPU_GET(cpumask);
if (mask == 0)
return;
ncpu = popcnt(mask);
ncpu = bitcount32(mask);
if (ncpu > othercpus) {
/* XXX this should be a panic offence */
printf("SMP: tlb shootdown to %d other cpus (only have %d)\n",

View File

@ -330,4 +330,56 @@ int alloc_unr(struct unrhdr *uh);
int alloc_unrl(struct unrhdr *uh);
void free_unr(struct unrhdr *uh, u_int item);
/*
* This is about as magic as it gets. fortune(1) has got similar code
* for reversing bits in a word. Who thinks up this stuff??
*
* Yes, it does appear to be consistently faster than:
* while (i = ffs(m)) {
* m >>= i;
* bits++;
* }
* and
* while (lsb = (m & -m)) { // This is magic too
* m &= ~lsb; // or: m ^= lsb
* bits++;
* }
* Both of these latter forms do some very strange things on gcc-3.1 with
* -mcpu=pentiumpro and/or -march=pentiumpro and/or -O or -O2.
* There is probably an SSE or MMX popcnt instruction.
*
* I wonder if this should be in libkern?
*
* XXX Stop the presses! Another one:
* static __inline u_int32_t
* popcnt1(u_int32_t v)
* {
* v -= ((v >> 1) & 0x55555555);
* v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
* v = (v + (v >> 4)) & 0x0F0F0F0F;
* return (v * 0x01010101) >> 24;
* }
* The downside is that it has a multiply. With a pentium3 with
* -mcpu=pentiumpro and -march=pentiumpro then gcc-3.1 will use
* an imull, and in that case it is faster. In most other cases
* it appears slightly slower.
*
* Another variant (also from fortune):
* #define BITCOUNT(x) (((BX_(x)+(BX_(x)>>4)) & 0x0F0F0F0F) % 255)
* #define BX_(x) ((x) - (((x)>>1)&0x77777777) \
* - (((x)>>2)&0x33333333) \
* - (((x)>>3)&0x11111111))
*/
/*
 * bitcount32 -- return the population count (number of 1 bits) of x.
 *
 * Parallel bit-group summation: at every stage, sums held in fields of
 * width w are combined into fields of width 2w by masking out each half
 * and adding it to its shifted neighbor.  Five stages take the field
 * width from 1 bit to the full 32-bit word.
 */
static __inline uint32_t
bitcount32(uint32_t x)
{

	x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
	x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
	x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
	x = (x & 0x00ff00ff) + ((x >> 8) & 0x00ff00ff);
	/* Top half now holds at most 16; shift alone suffices. */
	x = (x & 0x0000ffff) + (x >> 16);
	return (x);
}
#endif /* !_SYS_SYSTM_H_ */