Bring in dirhash, a simple hash-based lookup optimisation for large

directories. When enabled via "options UFS_DIRHASH", in-core hash
arrays are maintained for large directories. These allow all
directory operations to take place quickly instead of requiring
long linear searches. For now anyway, dirhash is not enabled by
default.

The in-core hash arrays have a memory requirement that is approximately
half the size of the on-disk directory file. A number
of new sysctl variables allow control over which directories get
hashed and over the maximum amount of memory that dirhash will use:

  vfs.ufs.dirhash_minsize
    The minimum on-disk directory size for which hashing should be
    used. The default is 2560 (2.5k).

  vfs.ufs.dirhash_maxmem
    The system-wide maximum total memory to be used by dirhash data
    structures. The default is 2097152 (2MB).

The current amount of memory being used by dirhash is visible
through the read-only sysctl variable vfs.ufs.dirhash_mem.
Finally, some extra sanity checks that are enabled by default, but
which may have an impact on performance, can be disabled by setting
vfs.ufs.dirhash_docheck to 0.

Discussed on: -fs, -hackers
This commit is contained in:
Ian Dowse 2001-07-10 21:21:29 +00:00
parent 63b2f7728a
commit 9b5ad47fb7
Notes: svn2git 2020-12-20 02:59:44 +00:00
svn path=/head/; revision=79561
11 changed files with 1292 additions and 2 deletions

View File

@ -705,6 +705,10 @@ options UFS_EXTATTR_AUTOSTART
# See src/sys/ufs/ufs/README.acls for more information.
options UFS_ACL
# Directory hashing improves the speed of operations on very large
# directories at the expense of some memory.
options UFS_DIRHASH
# Make space in the kernel for a root filesystem on a md device.
# Define to the number of kilobytes to reserve for the filesystem.
options MD_ROOT_SIZE=10

View File

@ -1229,6 +1229,7 @@ ufs/ffs/ffs_vnops.c optional ffs
ufs/ffs/ffs_vnops.c optional ifs
ufs/ufs/ufs_acl.c standard
ufs/ufs/ufs_bmap.c standard
ufs/ufs/ufs_dirhash.c standard
ufs/ufs/ufs_extattr.c standard
ufs/ufs/ufs_ihash.c standard
ufs/ufs/ufs_inode.c standard

View File

@ -156,6 +156,9 @@ UFS_ACL opt_ufs.h
UFS_EXTATTR opt_ufs.h
UFS_EXTATTR_AUTOSTART opt_ufs.h
# Enable fast hash lookups for large directories on UFS-based filesystems.
UFS_DIRHASH opt_ufs.h
# The above static dependencies are planned removed, with a
# <filesystem>_ROOT option to control if it usable as root. This list
# allows these options to be present in config files already (though

View File

@ -94,6 +94,8 @@ struct inode {
ino_t i_ino; /* Inode number of found directory. */
u_int32_t i_reclen; /* Size of found directory entry. */
u_int32_t i_spare[4]; /* XXX actually non-spare (for ext2fs). */
struct dirhash *i_dirhash; /* Hashing for large directories */
/*
* The on-disk dinode itself.
*/

View File

@ -94,6 +94,8 @@ struct inode {
ino_t i_ino; /* Inode number of found directory. */
u_int32_t i_reclen; /* Size of found directory entry. */
u_int32_t i_spare[4]; /* XXX actually non-spare (for ext2fs). */
struct dirhash *i_dirhash; /* Hashing for large directories */
/*
* The on-disk dinode itself.
*/

View File

@ -705,6 +705,10 @@ options UFS_EXTATTR_AUTOSTART
# See src/sys/ufs/ufs/README.acls for more information.
options UFS_ACL
# Directory hashing improves the speed of operations on very large
# directories at the expense of some memory.
options UFS_DIRHASH
# Make space in the kernel for a root filesystem on a md device.
# Define to the number of kilobytes to reserve for the filesystem.
options MD_ROOT_SIZE=10

124
sys/ufs/ufs/dirhash.h Normal file
View File

@ -0,0 +1,124 @@
/*
* Copyright (c) 2001 Ian Dowse. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD$
*/
#ifndef _UFS_UFS_DIRHASH_H_
#define _UFS_UFS_DIRHASH_H_
/*
* For fast operations on large directories, we maintain a hash
* that maps the file name to the offset of the directory entry within
* the directory file.
*
* The hashing uses a dumb spillover to the next free slot on
* collisions, so we must keep the utilisation low to avoid
* long linear searches. Deleted entries that are not the last
* in a chain must be marked DIRHASH_DEL.
*
* We also maintain information about free space in each block
* to speed up creations.
*/
/* Special values stored in a hash slot in place of a directory offset. */
#define DIRHASH_EMPTY (-1) /* entry unused */
#define DIRHASH_DEL (-2) /* deleted entry; may be part of chain */
/*
* Unit in which free space is counted (see dh_blkfree and dh_firstfree
* in struct dirhash).
* NOTE(review): presumably a byte count matching the on-disk entry
* alignment -- confirm against dir.h.
*/
#define DIRALIGN 4
#define DH_NFSTATS (DIRECTSIZ(MAXNAMLEN + 1) / DIRALIGN)
/* max DIRALIGN words in a directory entry */
/*
* Dirhash uses a score mechanism to achieve a hybrid between a
* least-recently-used and a least-often-used algorithm for entry
* recycling. The score is incremented when a directory is used, and
* decremented when the directory is a candidate for recycling. When
* the score reaches zero, the hash is recycled. Hashes are linked
* together on a TAILQ list, and hashes with higher scores filter
* towards the tail (most recently used) end of the list.
*
* New hash entries are given an initial score of DH_SCOREINIT and are
* placed at the most-recently-used end of the list. This helps a lot
* in the worst-case scenario where every directory access is
* to a directory that is not hashed (i.e. the working set of hash
* candidates is much larger than the configured memory limit). In this
* case it limits the number of hash builds to 1/DH_SCOREINIT of the
* number of accesses.
*/
/* Bounds for the dh_score field of struct dirhash. */
#define DH_SCOREINIT 8 /* initial dh_score when dirhash built */
#define DH_SCOREMAX 64 /* max dh_score value */
/*
* The main hash table has 2 levels. It is an array of pointers to
* blocks of DH_NBLKOFF offsets.
*/
#define DH_BLKOFFSHIFT 8 /* log2 of the number of offsets per block */
#define DH_NBLKOFF (1 << DH_BLKOFFSHIFT) /* offsets per block (256) */
#define DH_BLKOFFMASK (DH_NBLKOFF - 1) /* mask for the index within a block */
/*
* Access hash slot number `slot' of dirhash `dh'. The bits of `slot'
* above DH_BLKOFFSHIFT select the block pointer in dh_hash and the low
* DH_BLKOFFSHIFT bits select the offset within that block. The
* expansion is an lvalue, so it can be read or assigned.
*
* `slot' is evaluated twice; do not pass an expression with side
* effects.
*/
#define DH_ENTRY(dh, slot) \
((dh)->dh_hash[(slot) >> DH_BLKOFFSHIFT][(slot) & DH_BLKOFFMASK])
/*
* In-core hash state for one directory. An inode refers to its
* dirhash through the i_dirhash pointer; callers test that pointer
* against NULL before using any of the update routines.
*
* Locking, as stated on the fields below: dh_mtx covers every field
* except dh_list, which is covered by ufsdirhash_mtx instead.
*/
struct dirhash {
struct mtx dh_mtx; /* protects all fields except dh_list */
/* Name-to-offset hash table; index it with DH_ENTRY(). */
doff_t **dh_hash; /* the hash array (2-level) */
int dh_narrays; /* number of entries in dh_hash */
int dh_hlen; /* total slots in the 2-level hash array */
int dh_hused; /* entries in use */
/* Free space statistics. XXX assumes DIRBLKSIZ is 512. */
/*
* NOTE(review): the DIRBLKSIZ assumption presumably comes from
* dh_blkfree being u_int8_t -- 512 / DIRALIGN = 128 words fits in
* 8 bits, a much larger block would not. Confirm before changing.
*/
u_int8_t *dh_blkfree; /* free DIRALIGN words in each dir block */
int dh_nblk; /* size of dh_blkfree array */
int dh_dirblks; /* number of DIRBLKSIZ blocks in dir */
int dh_firstfree[DH_NFSTATS + 1]; /* first blk with N words free */
int dh_seqopt; /* sequential access optimisation enabled */
doff_t dh_seqoff; /* sequential access optimisation offset */
/* Recycling score; see DH_SCOREINIT and DH_SCOREMAX. */
int dh_score; /* access count for this dirhash */
int dh_onlist; /* true if on the ufsdirhash_list chain */
/* Protected by ufsdirhash_mtx. */
TAILQ_ENTRY(dirhash) dh_list; /* chain of all dirhashes */
};
/*
* Dirhash functions.
*
* NOTE(review): the notes below describe only how the visible callers
* (ufs_lookup(), ufs_direnter(), ufs_dirremove(), ufs_reclaim()) use
* each routine; confirm details against the implementation.
*/
/*
* Decide whether to hash the directory and build the table if so.
* Returns 0 when the directory is hashed and the table was built;
* ufs_lookup() falls back to a linear search on any other value.
*/
int ufsdirhash_build(struct inode *);
/*
* Find free space for an entry needing the given number of bytes.
* ufs_lookup() treats a return value >= 0 as the offset of a usable
* slot; the int * argument is passed &slotsize there, presumably to
* receive the size of the slot found.
*/
doff_t ufsdirhash_findfree(struct inode *, int, int *);
/*
* ufs_lookup() uses the result as `enduseful'; a negative return makes
* it use the directory size instead.
*/
doff_t ufsdirhash_enduseful(struct inode *);
/*
* Look up a name (pointer, length). ufs_lookup() handles three
* outcomes: 0 (found; the entry's offset has been stored through the
* first doff_t *), ENOENT (not found), and anything else (fall back to
* a linear search). The second doff_t * may be NULL; ufs_lookup()
* passes &prevoff only for DELETE operations.
*/
int ufsdirhash_lookup(struct inode *, char *, int, doff_t *, doff_t *);
/*
* Called by ufs_direnter() with the entry offset, immediately before
* ufsdirhash_add(), on the path that copies the entry into bp at blkoff.
*/
void ufsdirhash_newblk(struct inode *, doff_t);
/* Record the given entry as living at the given directory offset. */
void ufsdirhash_add(struct inode *, struct direct *, doff_t);
/* Drop the given entry, which lives at the given directory offset. */
void ufsdirhash_remove(struct inode *, struct direct *, doff_t);
/*
* Note that an entry has moved. ufs_direnter() passes the entry, its
* current offset, and then the offset it is being compacted to.
*/
void ufsdirhash_move(struct inode *, struct direct *, doff_t, doff_t);
/*
* Called by ufs_direnter() with the new end offset just before the
* directory is truncated with UFS_TRUNCATE().
*/
void ufsdirhash_dirtrunc(struct inode *, doff_t);
/*
* Release the dirhash of an inode. ufs_reclaim() calls this, when
* i_dirhash is non-NULL, before freeing the inode itself.
*/
void ufsdirhash_free(struct inode *);
/*
* Check one directory block against the hash. ufs_direnter() and
* ufs_dirremove() pass a pointer and the matching directory offset
* after masking both back to a DIRBLKSIZ boundary.
* NOTE(review): presumably the sanity check controlled by
* vfs.ufs.dirhash_docheck -- confirm.
*/
void ufsdirhash_checkblock(struct inode *, char *, doff_t);
#endif /* !_UFS_UFS_DIRHASH_H_ */

View File

@ -94,6 +94,8 @@ struct inode {
ino_t i_ino; /* Inode number of found directory. */
u_int32_t i_reclen; /* Size of found directory entry. */
u_int32_t i_spare[4]; /* XXX actually non-spare (for ext2fs). */
struct dirhash *i_dirhash; /* Hashing for large directories */
/*
* The on-disk dinode itself.
*/

1049
sys/ufs/ufs/ufs_dirhash.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -53,6 +53,10 @@
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#ifdef UFS_DIRHASH
#include <ufs/ufs/dir.h>
#include <ufs/ufs/dirhash.h>
#endif
/*
* Last reference to an inode. If necessary, write or delete it.
@ -167,6 +171,10 @@ ufs_reclaim(ap)
}
#endif
lockdestroy(&vp->v_lock);
#ifdef UFS_DIRHASH
if (ip->i_dirhash != NULL)
ufsdirhash_free(ip);
#endif
FREE(vp->v_data, VFSTOUFS(vp->v_mount)->um_malloctype);
vp->v_data = 0;
return (0);

View File

@ -39,6 +39,8 @@
* $FreeBSD$
*/
#include "opt_ufs.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
@ -58,6 +60,9 @@
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#ifdef UFS_DIRHASH
#include <ufs/ufs/dirhash.h>
#endif
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
@ -128,7 +133,7 @@ ufs_lookup(ap)
register struct vnode *vdp; /* vnode for directory being searched */
register struct inode *dp; /* inode for directory being searched */
struct buf *bp; /* a buffer of directory entries */
register struct direct *ep; /* the current directory entry */
struct direct *ep; /* the current directory entry */
int entryoffsetinblock; /* offset of ep in bp's buffer */
enum {NONE, COMPACT, FOUND} slotstatus;
doff_t slotoffset; /* offset of area with free space */
@ -181,7 +186,48 @@ ufs_lookup(ap)
slotstatus = NONE;
slotneeded = DIRECTSIZ(cnp->cn_namelen);
}
bmask = VFSTOUFS(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
#ifdef UFS_DIRHASH
/*
* Use dirhash for fast operations on large directories. The logic
* to determine whether to hash the directory is contained within
* ufsdirhash_build(); a zero return means that it decided to hash
* this directory and it successfully built up the hash table.
*/
if (ufsdirhash_build(dp) == 0) {
/* Look for a free slot if needed. */
enduseful = dp->i_size;
if (slotstatus != FOUND) {
slotoffset = ufsdirhash_findfree(dp, slotneeded,
&slotsize);
if (slotoffset >= 0) {
slotstatus = COMPACT;
enduseful = ufsdirhash_enduseful(dp);
if (enduseful < 0)
enduseful = dp->i_size;
}
}
/* Look up the component. */
numdirpasses = 1;
entryoffsetinblock = 0; /* silence compiler warning */
switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen,
&dp->i_offset, nameiop == DELETE ? &prevoff : NULL)) {
case 0:
error = UFS_BLKATOFF(vdp, (off_t)dp->i_offset,
(char **)&ep, &bp);
if (error)
return (error);
goto foundentry;
case ENOENT:
dp->i_offset = roundup2(dp->i_size, DIRBLKSIZ);
goto notfound;
default:
/* Something failed; just do a linear search. */
break;
}
}
#endif /* UFS_DIRHASH */
/*
* If there is cached information on a previous search of
* this directory, pick up where we last left off.
@ -193,7 +239,6 @@ ufs_lookup(ap)
* profiling time and hence has been removed in the interest
* of simplicity.
*/
bmask = VFSTOUFS(vdp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
if (nameiop != LOOKUP || dp->i_diroff == 0 ||
dp->i_diroff >= dp->i_size) {
entryoffsetinblock = 0;
@ -299,6 +344,9 @@ ufs_lookup(ap)
(cnp->cn_nameptr[0] == ep->d_name[0]) &&
!bcmp(cnp->cn_nameptr, ep->d_name,
(unsigned)namlen)) {
#ifdef UFS_DIRHASH
foundentry:
#endif
/*
* Save directory entry's inode number and
* reclen in ndp->ni_ufs area, and release
@ -732,6 +780,14 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
blkoff = dp->i_offset &
(VFSTOUFS(dvp->v_mount)->um_mountp->mnt_stat.f_iosize - 1);
bcopy((caddr_t)dirp, (caddr_t)bp->b_data + blkoff,newentrysize);
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL) {
ufsdirhash_newblk(dp, dp->i_offset);
ufsdirhash_add(dp, dirp, dp->i_offset);
ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff,
dp->i_offset);
}
#endif
if (DOINGSOFTDEP(dvp)) {
/*
* Ensure that the entire newly allocated block is a
@ -828,6 +884,11 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
}
dsize = DIRSIZ(OFSFMT(dvp), nep);
spacefree += nep->d_reclen - dsize;
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL)
ufsdirhash_move(dp, nep, dp->i_offset + loc,
dp->i_offset + ((char *)ep - dirbuf));
#endif
loc += nep->d_reclen;
if (DOINGSOFTDEP(dvp))
softdep_change_directoryentry_offset(dp, dirbuf,
@ -852,7 +913,18 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
ep->d_reclen = dsize;
ep = (struct direct *)((char *)ep + dsize);
}
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL && (ep->d_ino == 0 ||
dirp->d_reclen == spacefree))
ufsdirhash_add(dp, dirp, dp->i_offset + ((char *)ep - dirbuf));
#endif
bcopy((caddr_t)dirp, (caddr_t)ep, (u_int)newentrysize);
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL)
ufsdirhash_checkblock(dp, dirbuf -
(dp->i_offset & (DIRBLKSIZ - 1)),
dp->i_offset & ~(DIRBLKSIZ - 1));
#endif
if (DOINGSOFTDEP(dvp)) {
(void) softdep_setup_directory_add(bp, dp,
@ -878,6 +950,10 @@ ufs_direnter(dvp, tvp, dirp, cnp, newdirbp)
if (error == 0 && dp->i_endoff && dp->i_endoff < dp->i_size) {
if (tvp != NULL)
VOP_UNLOCK(tvp, 0, p);
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL)
ufsdirhash_dirtrunc(dp, dp->i_endoff);
#endif
(void) UFS_TRUNCATE(dvp, (off_t)dp->i_endoff, IO_SYNC, cr, p);
if (tvp != NULL)
vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, p);
@ -926,6 +1002,15 @@ ufs_dirremove(dvp, ip, flags, isrmdir)
if ((error = UFS_BLKATOFF(dvp,
(off_t)(dp->i_offset - dp->i_count), (char **)&ep, &bp)) != 0)
return (error);
#ifdef UFS_DIRHASH
/*
* Remove the dirhash entry. This is complicated by the fact
* that `ep' is the previous entry when dp->i_count != 0.
*/
if (dp->i_dirhash != NULL)
ufsdirhash_remove(dp, (dp->i_count == 0) ? ep :
(struct direct *)((char *)ep + ep->d_reclen), dp->i_offset);
#endif
if (dp->i_count == 0) {
/*
* First entry in block: set d_ino to zero.
@ -937,6 +1022,12 @@ ufs_dirremove(dvp, ip, flags, isrmdir)
*/
ep->d_reclen += dp->i_reclen;
}
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL)
ufsdirhash_checkblock(dp, (char *)ep -
((dp->i_offset - dp->i_count) & (DIRBLKSIZ - 1)),
dp->i_offset & ~(DIRBLKSIZ - 1));
#endif
out:
if (DOINGSOFTDEP(dvp)) {
if (ip) {