diff --git a/sbin/newfs/mkfs.c b/sbin/newfs/mkfs.c index 8386c1ae80a5..9ac5502d81cf 100644 --- a/sbin/newfs/mkfs.c +++ b/sbin/newfs/mkfs.c @@ -119,6 +119,8 @@ extern int maxbpg; /* maximum blocks per file in a cyl group */ extern int nrpos; /* # of distinguished rotational positions */ extern int bbsize; /* boot block size */ extern int sbsize; /* superblock size */ +extern int avgfilesize; /* expected average file size */ +extern int avgfilesperdir; /* expected number of files per directory */ extern u_long memleft; /* virtual memory available */ extern caddr_t membase; /* start address of memory based filesystem */ extern char * filename; @@ -273,6 +275,17 @@ mkfs(pp, fsys, fi, fo) printf("preposterous ntrak %d\n", sblock.fs_ntrak), exit(14); if (sblock.fs_nsect <= 0) printf("preposterous nsect %d\n", sblock.fs_nsect), exit(15); + /* + * collect and verify the filesystem density info + */ + sblock.fs_avgfilesize = avgfilesize; + sblock.fs_avgfpdir = avgfilesperdir; + if (sblock.fs_avgfilesize <= 0) + printf("illegal expected average file size %d\n", + sblock.fs_avgfilesize), exit(14); + if (sblock.fs_avgfpdir <= 0) + printf("illegal expected number of files per directory %d\n", + sblock.fs_avgfpdir), exit(15); /* * collect and verify the block and fragment sizes */ diff --git a/sbin/newfs/newfs.c b/sbin/newfs/newfs.c index e75599397a68..8eed25682a06 100644 --- a/sbin/newfs/newfs.c +++ b/sbin/newfs/newfs.c @@ -163,9 +163,7 @@ void fatal(); * The number of sectors are used to determine the size of a cyl-group. * Kirk suggested one or two meg per "cylinder" so we say two. 
*/ - #define NTRACKS 1 /* number of heads */ - #define NSECTORS 4096 /* number of sectors */ int mfs; /* run as the memory based filesystem */ @@ -199,6 +197,8 @@ int maxcontig = 0; /* max contiguous blocks to allocate */ int rotdelay = ROTDELAY; /* rotational delay between blocks */ int maxbpg; /* maximum blocks per file in a cyl group */ int nrpos = NRPOS; /* # of distinguished rotational positions */ +int avgfilesize = AVFILESIZ;/* expected average file size */ +int avgfilesperdir = AFPDIR;/* expected number of files per directory */ int bbsize = BBSIZE; /* boot block size */ int sbsize = SBSIZE; /* superblock size */ int mntflags = MNT_ASYNC; /* flags to be passed to mount */ @@ -256,8 +256,8 @@ main(argc, argv) } opstring = mfs ? - "NF:T:Ua:b:c:d:e:f:i:m:o:s:" : - "NOS:T:Ua:b:c:d:e:f:i:k:l:m:n:o:p:r:s:t:u:vx:"; + "NF:T:Ua:b:c:d:e:f:g:h:i:m:o:s:" : + "NOS:T:Ua:b:c:d:e:f:g:h:i:k:l:m:n:o:p:r:s:t:u:vx:"; while ((ch = getopt(argc, argv, opstring)) != -1) switch (ch) { case 'N': @@ -308,6 +308,14 @@ main(argc, argv) if ((fsize = atoi(optarg)) <= 0) fatal("%s: bad fragment size", optarg); break; + case 'g': + if ((avgfilesize = atoi(optarg)) <= 0) + fatal("%s: bad average file size", optarg); + break; + case 'h': + if ((avgfilesperdir = atoi(optarg)) <= 0) + fatal("%s: bad average files per dir", optarg); + break; case 'i': if ((density = atoi(optarg)) <= 0) fatal("%s: bad bytes per inode", optarg); @@ -768,6 +776,8 @@ usage() fprintf(stderr, "\t-d rotational delay between contiguous blocks\n"); fprintf(stderr, "\t-e maximum blocks per file in a cylinder group\n"); fprintf(stderr, "\t-f frag size\n"); + fprintf(stderr, "\t-g average file size\n"); + fprintf(stderr, "\t-h average files per directory\n"); fprintf(stderr, "\t-i number of bytes per inode\n"); fprintf(stderr, "\t-k sector 0 skew, per track\n"); fprintf(stderr, "\t-l hardware sector interleave\n"); diff --git a/sbin/tunefs/tunefs.c b/sbin/tunefs/tunefs.c index 734e01d75204..1fda3f794c0f 100644 --- 
a/sbin/tunefs/tunefs.c +++ b/sbin/tunefs/tunefs.c @@ -91,9 +91,10 @@ main(argc, argv) char *special, *name; struct stat st; int Aflag = 0, active = 0; - int aflag = 0, dflag = 0, eflag = 0, mflag = 0; - int nflag = 0, oflag = 0, pflag = 0; - int avalue = 0, dvalue = 0, evalue = 0, mvalue = 0, ovalue = 0; + int aflag = 0, dflag = 0, eflag = 0, fflag = 0, mflag = 0; + int nflag = 0, oflag = 0, pflag = 0, sflag = 0; + int avalue = 0, dvalue = 0, evalue = 0, fvalue = 0; + int mvalue = 0, ovalue = 0, svalue = 0; char *nvalue = NULL; struct fstab *fs; char *chg[2], device[MAXPATHLEN]; @@ -104,7 +105,7 @@ main(argc, argv) if (argc < 3) usage(); found_arg = 0; /* at least one arg is required */ - while ((ch = getopt(argc, argv, "Aa:d:e:m:n:o:p")) != -1) + while ((ch = getopt(argc, argv, "Aa:d:e:f:m:n:o:ps:")) != -1) switch (ch) { case 'A': found_arg = 1; @@ -132,6 +133,14 @@ main(argc, argv) errx(10, "%s must be >= 1 (was %s)", name, optarg); eflag = 1; break; + case 'f': + found_arg = 1; + name = "average file size"; + fvalue = atoi(optarg); + if (fvalue < 1) + errx(10, "%s must be >= 1 (was %s)", name, optarg); + fflag = 1; + break; case 'm': found_arg = 1; name = "minimum percentage of free space"; @@ -168,6 +177,14 @@ main(argc, argv) found_arg = 1; pflag = 1; break; + case 's': + found_arg = 1; + name = "expected number of files per directory"; + svalue = atoi(optarg); + if (svalue < 1) + errx(10, "%s must be >= 1 (was %s)", name, optarg); + sflag = 1; + break; default: usage(); } @@ -239,6 +256,17 @@ main(argc, argv) sblock.fs_maxbpg = evalue; } } + if (fflag) { + name = "average file size"; + if (sblock.fs_avgfilesize == fvalue) { + warnx("%s remains unchanged as %d", name, fvalue); + } + else { + warnx("%s changes from %d to %d", + name, sblock.fs_avgfilesize, fvalue); + sblock.fs_avgfilesize = fvalue; + } + } if (mflag) { name = "minimum percentage of free space"; if (sblock.fs_minfree == mvalue) { @@ -291,6 +319,17 @@ main(argc, argv) warnx(OPTWARN, "space", "<", 
MINFREE); } } + if (sflag) { + name = "expected number of files per directory"; + if (sblock.fs_avgfpdir == svalue) { + warnx("%s remains unchanged as %d", name, svalue); + } + else { + warnx("%s changes from %d to %d", + name, sblock.fs_avgfpdir, svalue); + sblock.fs_avgfpdir = svalue; + } + } putsb(&sblock, special, Aflag); if (active) { @@ -307,9 +346,9 @@ void usage() { fprintf(stderr, "%s\n%s\n%s\n", -"usage: tunefs [-A] [-a maxcontig] [-d rotdelay] [-e maxbpg] [-m minfree]", -" [-p] [-n enable | disable] [-o space | time]", -" special | filesystem"); +"usage: tunefs [-A] [-a maxcontig] [-d rotdelay] [-e maxbpg] [-f avgfilesize]", +" [-m minfree] [-p] [-n enable | disable] [-o space | time]", +" [-s filesperdir] special | filesystem"); exit(2); } @@ -366,6 +405,10 @@ printfs() sblock.fs_rotdelay); warnx("maximum blocks per file in a cylinder group: (-e) %d", sblock.fs_maxbpg); + warnx("average file size: (-f) %d", + sblock.fs_avgfilesize); + warnx("average number of files in a directory: (-s) %d", + sblock.fs_avgfpdir); warnx("minimum percentage of free space: (-m) %d%%", sblock.fs_minfree); warnx("optimization preference: (-o) %s", diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c index 9476933a3899..81fb75ebaffb 100644 --- a/sys/ufs/ffs/ffs_alloc.c +++ b/sys/ufs/ffs/ffs_alloc.c @@ -71,7 +71,7 @@ static void ffs_clusteracct __P((struct fs *, struct cg *, ufs_daddr_t, int)); static ufs_daddr_t ffs_clusteralloc __P((struct inode *, int, ufs_daddr_t, int)); -static ino_t ffs_dirpref __P((struct fs *)); +static ino_t ffs_dirpref __P((struct inode *)); static ufs_daddr_t ffs_fragextend __P((struct inode *, int, long, int, int)); static void ffs_fserr __P((struct fs *, u_int, char *)); static u_long ffs_hashalloc @@ -593,12 +593,23 @@ ffs_valloc(pvp, mode, cred, vpp) goto noinodes; if ((mode & IFMT) == IFDIR) - ipref = ffs_dirpref(fs); + ipref = ffs_dirpref(pip); else ipref = pip->i_number; if (ipref >= fs->fs_ncg * fs->fs_ipg) ipref = 0; cg = 
ino_to_cg(fs, ipref); + /* + * Track number of dirs created one after another + * in a same cg without intervening by files. + */ + if ((mode & IFMT) == IFDIR) { + if (fs->fs_contigdirs[cg] < 255) + fs->fs_contigdirs[cg]++; + } else { + if (fs->fs_contigdirs[cg] > 0) + fs->fs_contigdirs[cg]--; + } ino = (ino_t)ffs_hashalloc(pip, cg, (long)ipref, mode, (allocfcn_t *)ffs_nodealloccg); if (ino == 0) @@ -633,28 +644,112 @@ ffs_valloc(pvp, mode, cred, vpp) } /* - * Find a cylinder to place a directory. + * Find a cylinder group to place a directory. * - * The policy implemented by this algorithm is to select from - * among those cylinder groups with above the average number of - * free inodes, the one with the smallest number of directories. + * The policy implemented by this algorithm is to allocate a + * directory inode in the same cylinder group as its parent + * directory, but also to reserve space for its files inodes + * and data. Restrict the number of directories which may be + * allocated one after another in the same cylinder group + * without intervening allocation of files. + * + * If we allocate a first level directory then force allocation + * in another cylinder group. */ static ino_t -ffs_dirpref(fs) - register struct fs *fs; +ffs_dirpref(pip) + struct inode *pip; { - int cg, minndir, mincg, avgifree; + register struct fs *fs; + int cg, prefcg, dirsize, cgsize; + int avgifree, avgbfree, avgndir, curdirsize; + int minifree, minbfree, maxndir; + int mincg, minndir; + int maxcontigdirs; + + fs = pip->i_fs; avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg; - minndir = fs->fs_ipg; - mincg = 0; - for (cg = 0; cg < fs->fs_ncg; cg++) - if (fs->fs_cs(fs, cg).cs_ndir < minndir && - fs->fs_cs(fs, cg).cs_nifree >= avgifree) { - mincg = cg; - minndir = fs->fs_cs(fs, cg).cs_ndir; + avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg; + avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg; + + /* + * Force allocation in another cg if creating a first level dir. 
+ */ + if (ITOV(pip)->v_flag & VROOT) { + prefcg = arc4random() % fs->fs_ncg; + mincg = prefcg; + minndir = fs->fs_ipg; + for (cg = prefcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < minndir && + fs->fs_cs(fs, cg).cs_nifree >= avgifree && + fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + mincg = cg; + minndir = fs->fs_cs(fs, cg).cs_ndir; + } + for (cg = 0; cg < prefcg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < minndir && + fs->fs_cs(fs, cg).cs_nifree >= avgifree && + fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) { + mincg = cg; + minndir = fs->fs_cs(fs, cg).cs_ndir; + } + return ((ino_t)(fs->fs_ipg * mincg)); + } + + /* + * Compute the various limits which are used for + * optimal allocation of a directory inode. + */ + maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg); + minifree = avgifree - fs->fs_ipg / 4; + if (minifree < 0) + minifree = 0; + minbfree = avgbfree - fs->fs_fpg / fs->fs_frag / 4; + if (minbfree < 0) + minbfree = 0; + cgsize = fs->fs_fsize * fs->fs_fpg; + dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir; + curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0; + if (dirsize < curdirsize) + dirsize = curdirsize; + maxcontigdirs = min(cgsize / dirsize, 255); + if (fs->fs_avgfpdir > 0) + maxcontigdirs = min(maxcontigdirs, + fs->fs_ipg / fs->fs_avgfpdir); + if (maxcontigdirs == 0) + maxcontigdirs = 1; + + /* + * Limit number of dirs in one cg and reserve space for + * regular files, but only if we have no deficit in + * inodes or space. 
+ */ + prefcg = ino_to_cg(fs, pip->i_number); + for (cg = prefcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < maxndir && + fs->fs_cs(fs, cg).cs_nifree >= minifree && + fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { + if (fs->fs_contigdirs[cg] < maxcontigdirs) + return ((ino_t)(fs->fs_ipg * cg)); } - return ((ino_t)(fs->fs_ipg * mincg)); + for (cg = 0; cg < prefcg; cg++) + if (fs->fs_cs(fs, cg).cs_ndir < maxndir && + fs->fs_cs(fs, cg).cs_nifree >= minifree && + fs->fs_cs(fs, cg).cs_nbfree >= minbfree) { + if (fs->fs_contigdirs[cg] < maxcontigdirs) + return ((ino_t)(fs->fs_ipg * cg)); + } + /* + * This is a backstop when we have deficit in space. + */ + for (cg = prefcg; cg < fs->fs_ncg; cg++) + if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) + return ((ino_t)(fs->fs_ipg * cg)); + for (cg = 0; cg < prefcg; cg++) + if (fs->fs_cs(fs, cg).cs_nifree >= avgifree) + break; + return ((ino_t)(fs->fs_ipg * cg)); } /* diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c index 9803a222ab5a..17fa4316863a 100644 --- a/sys/ufs/ffs/ffs_vfsops.c +++ b/sys/ufs/ffs/ffs_vfsops.c @@ -624,6 +624,7 @@ ffs_mountfs(devvp, mp, p, malloctype) blks = howmany(size, fs->fs_fsize); if (fs->fs_contigsumsize > 0) size += fs->fs_ncg * sizeof(int32_t); + size += fs->fs_ncg * sizeof(u_int8_t); space = malloc((u_long)size, M_UFSMNT, M_WAITOK); fs->fs_csp = space; for (i = 0; i < blks; i += fs->fs_frag) { @@ -645,6 +646,15 @@ ffs_mountfs(devvp, mp, p, malloctype) for (i = 0; i < fs->fs_ncg; i++) *lp++ = fs->fs_contigsumsize; } + size = fs->fs_ncg * sizeof(u_int8_t); + fs->fs_contigdirs = (u_int8_t *)space; + space = (u_int8_t *)space + size; + bzero(fs->fs_contigdirs, size); + /* Compatibility for old filesystems XXX */ + if (fs->fs_avgfilesize <= 0) /* XXX */ + fs->fs_avgfilesize = AVFILESIZ; /* XXX */ + if (fs->fs_avgfpdir <= 0) /* XXX */ + fs->fs_avgfpdir = AFPDIR; /* XXX */ mp->mnt_data = (qaddr_t)ump; mp->mnt_stat.f_fsid.val[0] = fs->fs_id[0]; mp->mnt_stat.f_fsid.val[1] = 
fs->fs_id[1]; diff --git a/sys/ufs/ffs/fs.h b/sys/ufs/ffs/fs.h index 4083c14161ab..c11a9f8d5b43 100644 --- a/sys/ufs/ffs/fs.h +++ b/sys/ufs/ffs/fs.h @@ -108,15 +108,17 @@ /* * There is a 128-byte region in the superblock reserved for in-core * pointers to summary information. Originally this included an array - * of pointers to blocks of struct csum; now there are just two + * of pointers to blocks of struct csum; now there are just three * pointers and the remaining space is padded with fs_ocsp[]. * * NOCSPTRS determines the size of this padding. One pointer (fs_csp) * is taken away to point to a contiguous array of struct csum for * all cylinder groups; a second (fs_maxcluster) points to an array - * of cluster sizes that is computed as cylinder groups are inspected. + * of cluster sizes that is computed as cylinder groups are inspected, + * and the third points to an array that tracks the creation of new + * directories. */ -#define NOCSPTRS ((128 / sizeof(void *)) - 2) +#define NOCSPTRS ((128 / sizeof(void *)) - 3) /* * A summary of contiguous blocks of various sizes is maintained @@ -141,6 +143,18 @@ #define MINFREE 8 #define DEFAULTOPT FS_OPTTIME +/* + * Grigoriy Orlov has done some extensive work to fine + * tune the layout preferences for directories within a filesystem. + * His algorithm can be tuned by adjusting the following parameters + * which tell the system the average file size and the average number + * of files per directory. These defaults are well selected for typical + * filesystems, but may need to be tuned for odd cases like filesystems + * being used for squid caches or news spools. + */ +#define AVFILESIZ 16384 /* expected average file size */ +#define AFPDIR 64 /* expected number of files per directory */ + /* * The maximum number of snapshot nodes that can be associated * with each filesystem. 
This limit affects only the number of @@ -273,12 +287,15 @@ struct fs { /* these fields retain the current block allocation info */ int32_t fs_cgrotor; /* last cg searched */ void *fs_ocsp[NOCSPTRS]; /* padding; was list of fs_cs buffers */ + u_int8_t *fs_contigdirs; /* # of contiguously allocated dirs */ struct csum *fs_csp; /* cg summary info buffer for fs_cs */ int32_t *fs_maxcluster; /* max cluster in each cyl group */ int32_t fs_cpc; /* cyl per cycle in postbl */ int16_t fs_opostbl[16][8]; /* old rotation block list head */ int32_t fs_snapinum[FSMAXSNAP];/* list of snapshot inode numbers */ - int32_t fs_sparecon[30]; /* reserved for future constants */ + int32_t fs_avgfilesize; /* expected average file size */ + int32_t fs_avgfpdir; /* expected # of files per directory */ + int32_t fs_sparecon[28]; /* reserved for future constants */ int32_t fs_contigsumsize; /* size of cluster summary array */ int32_t fs_maxsymlinklen; /* max length of an internal symlink */ int32_t fs_inodefmt; /* format of on-disk inodes */