2005-01-07 02:29:27 +00:00
|
|
|
/*-
|
2017-12-27 19:13:50 +00:00
|
|
|
* SPDX-License-Identifier: (BSD-2-Clause-FreeBSD AND BSD-3-Clause)
|
2017-11-20 19:43:44 +00:00
|
|
|
*
|
2002-06-21 06:18:05 +00:00
|
|
|
* Copyright (c) 2002 Networks Associates Technology, Inc.
|
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* This software was developed for the FreeBSD Project by Marshall
|
|
|
|
* Kirk McKusick and Network Associates Laboratories, the Security
|
|
|
|
* Research Division of Network Associates, Inc. under DARPA/SPAWAR
|
|
|
|
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
|
|
|
|
* research program
|
|
|
|
*
|
2004-10-20 08:05:02 +00:00
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
1994-05-24 10:09:53 +00:00
|
|
|
* Copyright (c) 1982, 1986, 1989, 1993
|
|
|
|
* The Regents of the University of California. All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
2017-02-28 23:42:47 +00:00
|
|
|
* 3. Neither the name of the University nor the names of its contributors
|
1994-05-24 10:09:53 +00:00
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
1997-02-10 02:22:35 +00:00
|
|
|
* @(#)ffs_alloc.c 8.18 (Berkeley) 5/26/95
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
|
2003-06-11 06:34:30 +00:00
|
|
|
#include <sys/cdefs.h>
|
|
|
|
__FBSDID("$FreeBSD$");
|
|
|
|
|
1996-01-05 18:31:58 +00:00
|
|
|
#include "opt_quota.h"
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/param.h>
|
2014-03-16 10:55:57 +00:00
|
|
|
#include <sys/capsicum.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/systm.h>
|
2000-05-05 09:59:14 +00:00
|
|
|
#include <sys/bio.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/buf.h>
|
1999-08-23 20:35:21 +00:00
|
|
|
#include <sys/conf.h>
|
2010-01-11 20:44:05 +00:00
|
|
|
#include <sys/fcntl.h>
|
2001-03-21 04:09:01 +00:00
|
|
|
#include <sys/file.h>
|
2003-01-01 01:56:19 +00:00
|
|
|
#include <sys/filedesc.h>
|
2006-11-06 13:42:10 +00:00
|
|
|
#include <sys/priv.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/proc.h>
|
|
|
|
#include <sys/vnode.h>
|
|
|
|
#include <sys/mount.h>
|
1998-11-13 01:01:44 +00:00
|
|
|
#include <sys/kernel.h>
|
2010-01-11 20:44:05 +00:00
|
|
|
#include <sys/syscallsubr.h>
|
1995-12-17 21:14:36 +00:00
|
|
|
#include <sys/sysctl.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <sys/syslog.h>
|
2010-12-29 12:25:28 +00:00
|
|
|
#include <sys/taskqueue.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2010-01-11 20:44:05 +00:00
|
|
|
#include <security/audit/audit.h>
|
|
|
|
|
2010-12-29 12:25:28 +00:00
|
|
|
#include <geom/geom.h>
|
|
|
|
|
2010-01-11 20:44:05 +00:00
|
|
|
#include <ufs/ufs/dir.h>
|
Introduce extended attribute support for FFS, allowing arbitrary
(name, value) pairs to be associated with inodes. This support is
used for ACLs, MAC labels, and Capabilities in the TrustedBSD
security extensions, which are currently under development.
In this implementation, attributes are backed to data vnodes in the
style of the quota support in FFS. Support for FFS extended
attributes may be enabled using the FFS_EXTATTR kernel option
(disabled by default). Userland utilities and man pages will be
committed in the next batch. VFS interfaces and man pages have
been in the repo since 4.0-RELEASE and are unchanged.
o ufs/ufs/extattr.h: UFS-specific extattr defines
o ufs/ufs/ufs_extattr.c: bulk of support routines
o ufs/{ufs,ffs,mfs}/*.[ch]: hooks and extattr.h includes
o contrib/softupdates/ffs_softdep.c: extattr.h includes
o conf/options, conf/files, i386/conf/LINT: added FFS_EXTATTR
o coda/coda_vfsops.c: XXX required extattr.h due to ufsmount.h
(This should not be the case, and will be fixed in a future commit)
Currently attributes are not supported in MFS. This will be fixed.
Reviewed by: adrian, bp, freebsd-fs, other unthanked souls
Obtained from: TrustedBSD Project
2000-04-15 03:34:27 +00:00
|
|
|
#include <ufs/ufs/extattr.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
#include <ufs/ufs/quota.h>
|
|
|
|
#include <ufs/ufs/inode.h>
|
1998-11-13 01:01:44 +00:00
|
|
|
#include <ufs/ufs/ufs_extern.h>
|
VFS mega cleanup commit (x/N)
1. Add new file "sys/kern/vfs_default.c" where default actions for
VOPs go. Implement proper defaults for ABORTOP, BWRITE, LEASE,
POLL, REVOKE and STRATEGY. Various stuff spread over the entire
tree belongs here.
2. Change VOP_BLKATOFF to a normal function in cd9660.
3. Kill VOP_BLKATOFF, VOP_TRUNCATE, VOP_VFREE, VOP_VALLOC. These
are private interface functions between UFS and the underlying
storage manager layer (FFS/LFS/MFS/EXT2FS). The functions now
live in struct ufsmount instead.
4. Remove a kludge of VOP_ functions in all filesystems, that did
nothing but obscure the simplicity and break the expandability.
If a filesystem doesn't implement VOP_FOO, it shouldn't have an
entry for it in its vnops table. The system will try to DTRT
if it is not implemented. There are still some cruft left, but
the bulk of it is done.
5. Fix another VCALL in vfs_cache.c (thanks Bruce!)
1997-10-16 10:50:27 +00:00
|
|
|
#include <ufs/ufs/ufsmount.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
#include <ufs/ffs/fs.h>
|
|
|
|
#include <ufs/ffs/ffs_extern.h>
|
2010-12-29 12:25:28 +00:00
|
|
|
#include <ufs/ffs/softdep.h>
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2010-02-10 20:10:35 +00:00
|
|
|
/*
 * Function type for the allocation routines that ffs_hashalloc() is handed
 * through its final (allocfcn_t *) argument; ffs_alloccg is the routine
 * passed by the callers in this file.  The arguments are the inode, the
 * cylinder group number to try, the preferred block address, and two sizes.
 * NOTE(review): "size" vs "rsize" presumably mean requested vs. really
 * needed size -- confirm against ffs_hashalloc().
 */
typedef ufs2_daddr_t allocfcn_t(struct inode *ip, u_int cg, ufs2_daddr_t bpref,
|
2010-04-24 07:05:35 +00:00
|
|
|
int size, int rsize);
|
1995-12-03 11:17:15 +00:00
|
|
|
|
2010-04-24 07:05:35 +00:00
|
|
|
static ufs2_daddr_t ffs_alloccg(struct inode *, u_int, ufs2_daddr_t, int, int);
|
2002-06-21 06:18:05 +00:00
|
|
|
static ufs2_daddr_t
|
2010-04-24 07:05:35 +00:00
|
|
|
ffs_alloccgblk(struct inode *, struct buf *, ufs2_daddr_t, int);
|
2010-12-29 12:25:28 +00:00
|
|
|
static void ffs_blkfree_cg(struct ufsmount *, struct fs *,
|
|
|
|
struct vnode *, ufs2_daddr_t, long, ino_t,
|
|
|
|
struct workhead *);
|
|
|
|
static void ffs_blkfree_trim_completed(struct bio *);
|
|
|
|
static void ffs_blkfree_trim_task(void *ctx, int pending __unused);
|
2007-11-08 17:21:51 +00:00
|
|
|
#ifdef INVARIANTS
|
2002-06-21 06:18:05 +00:00
|
|
|
static int ffs_checkblk(struct inode *, ufs2_daddr_t, long);
|
1997-11-22 08:35:46 +00:00
|
|
|
#endif
|
2015-04-24 23:27:50 +00:00
|
|
|
static ufs2_daddr_t ffs_clusteralloc(struct inode *, u_int, ufs2_daddr_t, int);
|
2002-03-19 22:40:48 +00:00
|
|
|
static ino_t ffs_dirpref(struct inode *);
|
2010-02-10 20:10:35 +00:00
|
|
|
static ufs2_daddr_t ffs_fragextend(struct inode *, u_int, ufs2_daddr_t,
|
|
|
|
int, int);
|
2002-06-21 06:18:05 +00:00
|
|
|
static ufs2_daddr_t ffs_hashalloc
|
2010-04-24 07:05:35 +00:00
|
|
|
(struct inode *, u_int, ufs2_daddr_t, int, int, allocfcn_t *);
|
|
|
|
static ufs2_daddr_t ffs_nodealloccg(struct inode *, u_int, ufs2_daddr_t, int,
|
|
|
|
int);
|
2002-06-21 06:18:05 +00:00
|
|
|
static ufs1_daddr_t ffs_mapsearch(struct fs *, struct cg *, ufs2_daddr_t, int);
|
|
|
|
static int ffs_reallocblks_ufs1(struct vop_reallocblks_args *);
|
|
|
|
static int ffs_reallocblks_ufs2(struct vop_reallocblks_args *);
|
2017-09-22 12:45:15 +00:00
|
|
|
static void ffs_ckhash_cg(struct buf *);
|
1994-05-25 09:21:21 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2002-05-16 21:28:32 +00:00
|
|
|
* Allocate a block in the filesystem.
|
1995-05-30 08:16:23 +00:00
|
|
|
*
|
1994-05-24 10:09:53 +00:00
|
|
|
* The size of the requested block is given, which must be some
|
|
|
|
* multiple of fs_fsize and <= fs_bsize.
|
|
|
|
* A preference may be optionally specified. If a preference is given
|
|
|
|
* the following hierarchy is used to allocate a block:
|
|
|
|
* 1) allocate the requested block.
|
|
|
|
* 2) allocate a rotationally optimal block in the same cylinder.
|
|
|
|
* 3) allocate a block in the same cylinder group.
|
|
|
|
* 4) quadratically rehash into other cylinder groups, until an
|
|
|
|
* available block is located.
|
2007-01-16 19:35:43 +00:00
|
|
|
* If no block preference is given the following hierarchy is used
|
1994-05-24 10:09:53 +00:00
|
|
|
* to allocate a block:
|
|
|
|
* 1) allocate a block in the cylinder group that contains the
|
|
|
|
* inode for the file.
|
|
|
|
* 2) quadratically rehash into other cylinder groups, until an
|
|
|
|
* available block is located.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2009-01-27 21:48:47 +00:00
|
|
|
ffs_alloc(ip, lbn, bpref, size, flags, cred, bnp)
|
2002-05-13 09:22:31 +00:00
|
|
|
struct inode *ip;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t lbn, bpref;
|
2009-01-27 21:48:47 +00:00
|
|
|
int size, flags;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct ucred *cred;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t *bnp;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2002-05-13 09:22:31 +00:00
|
|
|
struct fs *fs;
|
2005-01-24 10:08:35 +00:00
|
|
|
struct ufsmount *ump;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t bno;
|
2010-02-10 20:10:35 +00:00
|
|
|
u_int cg, reclaimed;
|
2005-10-31 20:33:28 +00:00
|
|
|
static struct timeval lastfail;
|
|
|
|
static int curfail;
|
2007-02-23 20:23:35 +00:00
|
|
|
int64_t delta;
|
1995-02-14 06:14:28 +00:00
|
|
|
#ifdef QUOTA
|
|
|
|
int error;
|
|
|
|
#endif
|
1995-05-30 08:16:23 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
*bnp = 0;
|
2016-09-17 16:47:34 +00:00
|
|
|
ump = ITOUMP(ip);
|
|
|
|
fs = ump->um_fs;
|
2005-01-24 10:08:35 +00:00
|
|
|
mtx_assert(UFS_MTX(ump), MA_OWNED);
|
2007-11-08 17:21:51 +00:00
|
|
|
#ifdef INVARIANTS
|
1994-05-24 10:09:53 +00:00
|
|
|
if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0) {
|
1999-08-23 20:35:21 +00:00
|
|
|
printf("dev = %s, bsize = %ld, size = %d, fs = %s\n",
|
2016-09-17 16:47:34 +00:00
|
|
|
devtoname(ump->um_dev), (long)fs->fs_bsize, size,
|
1999-08-23 20:35:21 +00:00
|
|
|
fs->fs_fsmnt);
|
1994-05-24 10:09:53 +00:00
|
|
|
panic("ffs_alloc: bad size");
|
|
|
|
}
|
|
|
|
if (cred == NOCRED)
|
1995-03-19 14:29:26 +00:00
|
|
|
panic("ffs_alloc: missing credential");
|
2007-11-08 17:21:51 +00:00
|
|
|
#endif /* INVARIANTS */
|
2002-01-22 06:17:22 +00:00
|
|
|
reclaimed = 0;
|
|
|
|
retry:
|
1994-05-24 10:09:53 +00:00
|
|
|
#ifdef QUOTA
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_UNLOCK(ump);
|
2002-06-21 06:18:05 +00:00
|
|
|
error = chkdq(ip, btodb(size), cred, 0);
|
1994-10-10 01:04:55 +00:00
|
|
|
if (error)
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
1994-05-24 10:09:53 +00:00
|
|
|
#endif
|
2005-01-24 10:08:35 +00:00
|
|
|
if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
|
|
|
|
goto nospace;
|
2007-06-12 00:12:01 +00:00
|
|
|
if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) &&
|
2005-01-24 10:08:35 +00:00
|
|
|
freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0)
|
|
|
|
goto nospace;
|
1994-05-24 10:09:53 +00:00
|
|
|
if (bpref >= fs->fs_size)
|
|
|
|
bpref = 0;
|
|
|
|
if (bpref == 0)
|
|
|
|
cg = ino_to_cg(fs, ip->i_number);
|
|
|
|
else
|
|
|
|
cg = dtog(fs, bpref);
|
2010-04-24 07:05:35 +00:00
|
|
|
bno = ffs_hashalloc(ip, cg, bpref, size, size, ffs_alloccg);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (bno > 0) {
|
2007-02-23 20:23:35 +00:00
|
|
|
delta = btodb(size);
|
|
|
|
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
|
2009-01-27 21:48:47 +00:00
|
|
|
if (flags & IO_EXT)
|
|
|
|
ip->i_flag |= IN_CHANGE;
|
|
|
|
else
|
|
|
|
ip->i_flag |= IN_CHANGE | IN_UPDATE;
|
1994-05-24 10:09:53 +00:00
|
|
|
*bnp = bno;
|
|
|
|
return (0);
|
|
|
|
}
|
2007-01-20 11:58:32 +00:00
|
|
|
nospace:
|
1994-05-24 10:09:53 +00:00
|
|
|
#ifdef QUOTA
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_UNLOCK(ump);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Restore user's disk quota because allocation failed.
|
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
(void) chkdq(ip, -btodb(size), cred, FORCE);
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
1994-05-24 10:09:53 +00:00
|
|
|
#endif
|
2011-06-10 22:48:35 +00:00
|
|
|
if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
|
2002-01-22 06:17:22 +00:00
|
|
|
reclaimed = 1;
|
2011-04-05 21:26:05 +00:00
|
|
|
softdep_request_cleanup(fs, ITOV(ip), cred, FLUSH_BLOCKS_WAIT);
|
2002-01-22 06:17:22 +00:00
|
|
|
goto retry;
|
|
|
|
}
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_UNLOCK(ump);
|
2011-06-15 18:05:08 +00:00
|
|
|
if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) {
|
2005-10-31 20:33:28 +00:00
|
|
|
ffs_fserr(fs, ip->i_number, "filesystem full");
|
|
|
|
uprintf("\n%s: write failed, filesystem is full\n",
|
|
|
|
fs->fs_fsmnt);
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
return (ENOSPC);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reallocate a fragment to a bigger size
|
|
|
|
*
|
|
|
|
* The number and size of the old block is given, and a preference
|
|
|
|
* and new size is also specified. The allocator attempts to extend
|
|
|
|
* the original block. Failing that, the regular block allocator is
|
|
|
|
* invoked to get an appropriate block.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
2009-01-27 21:48:47 +00:00
|
|
|
ffs_realloccg(ip, lbprev, bprev, bpref, osize, nsize, flags, cred, bpp)
|
2002-05-13 09:22:31 +00:00
|
|
|
struct inode *ip;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t lbprev;
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
ufs2_daddr_t bprev;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t bpref;
|
2009-01-27 21:48:47 +00:00
|
|
|
int osize, nsize, flags;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct ucred *cred;
|
|
|
|
struct buf **bpp;
|
|
|
|
{
|
2002-01-22 06:17:22 +00:00
|
|
|
struct vnode *vp;
|
|
|
|
struct fs *fs;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct buf *bp;
|
2005-01-24 10:08:35 +00:00
|
|
|
struct ufsmount *ump;
|
2010-02-10 20:10:35 +00:00
|
|
|
u_int cg, request, reclaimed;
|
2013-03-19 15:08:15 +00:00
|
|
|
int error, gbflags;
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
ufs2_daddr_t bno;
|
2005-10-31 20:33:28 +00:00
|
|
|
static struct timeval lastfail;
|
|
|
|
static int curfail;
|
2007-02-23 20:23:35 +00:00
|
|
|
int64_t delta;
|
1995-05-30 08:16:23 +00:00
|
|
|
|
2002-01-22 06:17:22 +00:00
|
|
|
vp = ITOV(ip);
|
2016-09-17 16:47:34 +00:00
|
|
|
ump = ITOUMP(ip);
|
|
|
|
fs = ump->um_fs;
|
2005-01-24 10:08:35 +00:00
|
|
|
bp = NULL;
|
2013-03-19 15:08:15 +00:00
|
|
|
gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
|
|
|
|
|
2005-01-24 10:08:35 +00:00
|
|
|
mtx_assert(UFS_MTX(ump), MA_OWNED);
|
2007-11-08 17:21:51 +00:00
|
|
|
#ifdef INVARIANTS
|
2002-01-22 06:17:22 +00:00
|
|
|
if (vp->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
|
2000-07-11 22:07:57 +00:00
|
|
|
panic("ffs_realloccg: allocation on suspended filesystem");
|
1994-05-24 10:09:53 +00:00
|
|
|
if ((u_int)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
|
|
|
|
(u_int)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
|
|
|
|
printf(
|
1999-08-23 20:35:21 +00:00
|
|
|
"dev = %s, bsize = %ld, osize = %d, nsize = %d, fs = %s\n",
|
2016-09-17 16:47:34 +00:00
|
|
|
devtoname(ump->um_dev), (long)fs->fs_bsize, osize,
|
1995-05-11 19:26:53 +00:00
|
|
|
nsize, fs->fs_fsmnt);
|
1994-05-24 10:09:53 +00:00
|
|
|
panic("ffs_realloccg: bad size");
|
|
|
|
}
|
|
|
|
if (cred == NOCRED)
|
1995-03-19 14:29:26 +00:00
|
|
|
panic("ffs_realloccg: missing credential");
|
2007-11-08 17:21:51 +00:00
|
|
|
#endif /* INVARIANTS */
|
2002-01-22 06:17:22 +00:00
|
|
|
reclaimed = 0;
|
|
|
|
retry:
|
2007-06-12 00:12:01 +00:00
|
|
|
if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0) &&
|
2005-01-24 10:08:35 +00:00
|
|
|
freespace(fs, fs->fs_minfree) - numfrags(fs, nsize - osize) < 0) {
|
1994-05-24 10:09:53 +00:00
|
|
|
goto nospace;
|
2005-01-24 10:08:35 +00:00
|
|
|
}
|
Add support to UFS2 to provide storage for extended attributes.
As this code is not actually used by any of the existing
interfaces, it seems unlikely to break anything (famous
last words).
The internal kernel interface to manipulate these attributes
is invoked using two new IO_ flags: IO_NORMAL and IO_EXT.
These flags may be specified in the ioflags word of VOP_READ,
VOP_WRITE, and VOP_TRUNCATE. Specifying IO_NORMAL means that
you want to do I/O to the normal data part of the file and
IO_EXT means that you want to do I/O to the extended attributes
part of the file. IO_NORMAL and IO_EXT are mutually exclusive
for VOP_READ and VOP_WRITE, but may be specified individually
or together in the case of VOP_TRUNCATE. For example, when
removing a file, VOP_TRUNCATE is called with both IO_NORMAL
and IO_EXT set. For backward compatibility, if neither IO_NORMAL
nor IO_EXT is set, then IO_NORMAL is assumed.
Note that the BA_ and IO_ flags have been `merged' so that they
may both be used in the same flags word. This merger is possible
by assigning the IO_ flags to the low sixteen bits and the BA_
flags the high sixteen bits. This works because the high sixteen
bits of the IO_ word is reserved for read-ahead and help with
write clustering so will never be used for flags. This merge
lets us get away from code of the form:
if (ioflags & IO_SYNC)
flags |= BA_SYNC;
For the future, I have considered adding a new field to the
vattr structure, va_extsize. This addition could then be
exported through the stat structure to allow applications to
find out the size of the extended attribute storage and also
would provide a more standard interface for truncating them
(via VOP_SETATTR rather than VOP_TRUNCATE).
I am also contemplating adding a pathconf parameter (for
concreteness, lets call it _PC_MAX_EXTSIZE) which would
let an application determine the maximum size of the extended
atribute storage.
Sponsored by: DARPA & NAI Labs.
2002-07-19 07:29:39 +00:00
|
|
|
if (bprev == 0) {
|
2002-06-23 18:17:27 +00:00
|
|
|
printf("dev = %s, bsize = %ld, bprev = %jd, fs = %s\n",
|
2016-09-17 16:47:34 +00:00
|
|
|
devtoname(ump->um_dev), (long)fs->fs_bsize, (intmax_t)bprev,
|
1998-07-11 07:46:16 +00:00
|
|
|
fs->fs_fsmnt);
|
1994-05-24 10:09:53 +00:00
|
|
|
panic("ffs_realloccg: bad bprev");
|
|
|
|
}
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_UNLOCK(ump);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Allocate the extra space in the buffer.
|
|
|
|
*/
|
2013-03-19 15:08:15 +00:00
|
|
|
error = bread_gb(vp, lbprev, osize, NOCRED, gbflags, &bp);
|
1994-10-10 01:04:55 +00:00
|
|
|
if (error) {
|
1994-05-24 10:09:53 +00:00
|
|
|
brelse(bp);
|
|
|
|
return (error);
|
|
|
|
}
|
1995-03-03 22:13:16 +00:00
|
|
|
|
2002-06-21 06:18:05 +00:00
|
|
|
if (bp->b_blkno == bp->b_lblkno) {
|
2017-02-15 19:50:26 +00:00
|
|
|
if (lbprev >= UFS_NDADDR)
|
1995-03-03 22:13:16 +00:00
|
|
|
panic("ffs_realloccg: lbprev out of range");
|
|
|
|
bp->b_blkno = fsbtodb(fs, bprev);
|
|
|
|
}
|
1995-05-30 08:16:23 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
#ifdef QUOTA
|
2002-06-21 06:18:05 +00:00
|
|
|
error = chkdq(ip, btodb(nsize - osize), cred, 0);
|
1994-10-10 01:04:55 +00:00
|
|
|
if (error) {
|
1994-05-24 10:09:53 +00:00
|
|
|
brelse(bp);
|
|
|
|
return (error);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
/*
|
|
|
|
* Check for extension in the existing location.
|
|
|
|
*/
|
2016-04-10 21:48:11 +00:00
|
|
|
*bpp = NULL;
|
1994-05-24 10:09:53 +00:00
|
|
|
cg = dtog(fs, bprev);
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
2002-06-21 06:18:05 +00:00
|
|
|
bno = ffs_fragextend(ip, cg, bprev, osize, nsize);
|
1994-10-10 01:04:55 +00:00
|
|
|
if (bno) {
|
1994-05-24 10:09:53 +00:00
|
|
|
if (bp->b_blkno != fsbtodb(fs, bno))
|
1997-03-09 06:00:44 +00:00
|
|
|
panic("ffs_realloccg: bad blockno");
|
2007-02-23 20:23:35 +00:00
|
|
|
delta = btodb(nsize - osize);
|
|
|
|
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
|
2009-01-27 21:48:47 +00:00
|
|
|
if (flags & IO_EXT)
|
|
|
|
ip->i_flag |= IN_CHANGE;
|
|
|
|
else
|
|
|
|
ip->i_flag |= IN_CHANGE | IN_UPDATE;
|
1995-03-26 23:29:13 +00:00
|
|
|
allocbuf(bp, nsize);
|
1994-05-24 10:09:53 +00:00
|
|
|
bp->b_flags |= B_DONE;
|
2013-03-19 15:08:15 +00:00
|
|
|
vfs_bio_bzero_buf(bp, osize, nsize - osize);
|
2009-05-17 20:26:00 +00:00
|
|
|
if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
|
|
|
|
vfs_bio_set_valid(bp, osize, nsize - osize);
|
1994-05-24 10:09:53 +00:00
|
|
|
*bpp = bp;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Allocate a new disk location.
|
|
|
|
*/
|
|
|
|
if (bpref >= fs->fs_size)
|
|
|
|
bpref = 0;
|
|
|
|
switch ((int)fs->fs_optim) {
|
|
|
|
case FS_OPTSPACE:
|
|
|
|
/*
|
1995-05-30 08:16:23 +00:00
|
|
|
* Allocate an exact sized fragment. Although this makes
|
|
|
|
* best use of space, we will waste time relocating it if
|
1994-05-24 10:09:53 +00:00
|
|
|
* the file continues to grow. If the fragmentation is
|
|
|
|
* less than half of the minimum free reserve, we choose
|
|
|
|
* to begin optimizing for time.
|
|
|
|
*/
|
|
|
|
request = nsize;
|
1995-03-10 22:11:50 +00:00
|
|
|
if (fs->fs_minfree <= 5 ||
|
1994-05-24 10:09:53 +00:00
|
|
|
fs->fs_cstotal.cs_nffree >
|
2000-03-15 07:08:36 +00:00
|
|
|
(off_t)fs->fs_dsize * fs->fs_minfree / (2 * 100))
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
log(LOG_NOTICE, "%s: optimization changed from SPACE to TIME\n",
|
|
|
|
fs->fs_fsmnt);
|
|
|
|
fs->fs_optim = FS_OPTTIME;
|
|
|
|
break;
|
|
|
|
case FS_OPTTIME:
|
|
|
|
/*
|
|
|
|
* At this point we have discovered a file that is trying to
|
|
|
|
* grow a small fragment to a larger fragment. To save time,
|
|
|
|
* we allocate a full sized block, then free the unused portion.
|
|
|
|
* If the file continues to grow, the `ffs_fragextend' call
|
|
|
|
* above will be able to grow it in place without further
|
|
|
|
* copying. If aberrant programs cause disk fragmentation to
|
|
|
|
* grow within 2% of the free reserve, we choose to begin
|
|
|
|
* optimizing for space.
|
|
|
|
*/
|
|
|
|
request = fs->fs_bsize;
|
|
|
|
if (fs->fs_cstotal.cs_nffree <
|
2000-03-15 07:08:36 +00:00
|
|
|
(off_t)fs->fs_dsize * (fs->fs_minfree - 2) / 100)
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
log(LOG_NOTICE, "%s: optimization changed from TIME to SPACE\n",
|
|
|
|
fs->fs_fsmnt);
|
|
|
|
fs->fs_optim = FS_OPTSPACE;
|
|
|
|
break;
|
|
|
|
default:
|
1999-08-23 20:35:21 +00:00
|
|
|
printf("dev = %s, optim = %ld, fs = %s\n",
|
2016-09-17 16:47:34 +00:00
|
|
|
devtoname(ump->um_dev), (long)fs->fs_optim, fs->fs_fsmnt);
|
1994-05-24 10:09:53 +00:00
|
|
|
panic("ffs_realloccg: bad optim");
|
|
|
|
/* NOTREACHED */
|
|
|
|
}
|
2010-04-24 07:05:35 +00:00
|
|
|
bno = ffs_hashalloc(ip, cg, bpref, request, nsize, ffs_alloccg);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (bno > 0) {
|
|
|
|
bp->b_blkno = fsbtodb(fs, bno);
|
2002-01-22 06:17:22 +00:00
|
|
|
if (!DOINGSOFTDEP(vp))
|
2016-09-17 16:47:34 +00:00
|
|
|
ffs_blkfree(ump, fs, ump->um_devvp, bprev, (long)osize,
|
2011-06-15 23:19:09 +00:00
|
|
|
ip->i_number, vp->v_type, NULL);
|
2007-02-23 20:23:35 +00:00
|
|
|
delta = btodb(nsize - osize);
|
|
|
|
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + delta);
|
2009-01-27 21:48:47 +00:00
|
|
|
if (flags & IO_EXT)
|
|
|
|
ip->i_flag |= IN_CHANGE;
|
|
|
|
else
|
|
|
|
ip->i_flag |= IN_CHANGE | IN_UPDATE;
|
1995-03-26 23:29:13 +00:00
|
|
|
allocbuf(bp, nsize);
|
1994-05-24 10:09:53 +00:00
|
|
|
bp->b_flags |= B_DONE;
|
2013-03-19 15:08:15 +00:00
|
|
|
vfs_bio_bzero_buf(bp, osize, nsize - osize);
|
2009-05-17 20:26:00 +00:00
|
|
|
if ((bp->b_flags & (B_MALLOC | B_VMIO)) == B_VMIO)
|
|
|
|
vfs_bio_set_valid(bp, osize, nsize - osize);
|
1994-05-24 10:09:53 +00:00
|
|
|
*bpp = bp;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
#ifdef QUOTA
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_UNLOCK(ump);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Restore user's disk quota because allocation failed.
|
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
1994-05-24 10:09:53 +00:00
|
|
|
#endif
|
|
|
|
nospace:
|
|
|
|
/*
|
|
|
|
* no space available
|
|
|
|
*/
|
2011-06-10 22:48:35 +00:00
|
|
|
if (reclaimed == 0 && (flags & IO_BUFLOCKED) == 0) {
|
2002-01-22 06:17:22 +00:00
|
|
|
reclaimed = 1;
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_UNLOCK(ump);
|
2010-02-13 10:34:50 +00:00
|
|
|
if (bp) {
|
2005-01-24 10:08:35 +00:00
|
|
|
brelse(bp);
|
2010-02-13 10:34:50 +00:00
|
|
|
bp = NULL;
|
|
|
|
}
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
2011-06-05 22:36:30 +00:00
|
|
|
softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT);
|
2002-01-22 06:17:22 +00:00
|
|
|
goto retry;
|
|
|
|
}
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_UNLOCK(ump);
|
|
|
|
if (bp)
|
|
|
|
brelse(bp);
|
2011-06-15 18:05:08 +00:00
|
|
|
if (reclaimed > 0 && ppsratecheck(&lastfail, &curfail, 1)) {
|
2005-10-31 20:33:28 +00:00
|
|
|
ffs_fserr(fs, ip->i_number, "filesystem full");
|
|
|
|
uprintf("\n%s: write failed, filesystem is full\n",
|
|
|
|
fs->fs_fsmnt);
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
return (ENOSPC);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reallocate a sequence of blocks into a contiguous sequence of blocks.
|
|
|
|
*
|
|
|
|
* The vnode and an array of buffer pointers for a range of sequential
|
|
|
|
* logical blocks to be made contiguous is given. The allocator attempts
|
2002-06-21 06:18:05 +00:00
|
|
|
* to find a range of sequential blocks starting as close as possible
|
|
|
|
* from the end of the allocation for the logical block immediately
|
|
|
|
* preceding the current range. If successful, the physical block numbers
|
|
|
|
* in the buffer pointers and in the inode are changed to reflect the new
|
|
|
|
* allocation. If unsuccessful, the allocation is left unchanged. The
|
|
|
|
* success in doing the reallocation is returned. Note that the error
|
|
|
|
* return is not reflected back to the user. Rather the previous block
|
|
|
|
* allocation will be used.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2001-03-21 04:09:01 +00:00
|
|
|
|
|
|
|
/* Declares the "ffs" sysctl node under _vfs; the knobs below hang off it. */
SYSCTL_NODE(_vfs, OID_AUTO, ffs, CTLFLAG_RW, 0, "FFS filesystem");
|
|
|
|
|
1995-12-17 21:14:36 +00:00
|
|
|
/*
 * Read-write knob, enabled (1) by default.  Per its description string,
 * a non-zero value avoids forcing synchronous writes when blocks are
 * reallocated.
 */
static int doasyncfree = 1;
|
2015-04-24 23:27:50 +00:00
|
|
|
SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0,
|
|
|
|
"do not force synchronous writes when blocks are reallocated");
|
1997-02-10 02:22:35 +00:00
|
|
|
|
1997-11-22 08:35:46 +00:00
|
|
|
/*
 * Read-write knob, enabled (1) by default.  When it is 0,
 * ffs_reallocblks() returns ENOSPC without reallocating anything.
 */
static int doreallocblks = 1;
|
2015-04-24 23:27:50 +00:00
|
|
|
SYSCTL_INT(_vfs_ffs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0,
|
|
|
|
"enable block reallocation");
|
|
|
|
|
|
|
|
/*
 * Read-write knob, default 10.  Per its description string, an upper
 * bound on the number of cylinder groups searched for contiguous blocks.
 */
static int maxclustersearch = 10;
|
|
|
|
SYSCTL_INT(_vfs_ffs, OID_AUTO, maxclustersearch, CTLFLAG_RW, &maxclustersearch,
|
|
|
|
0, "max number of cylinder group to search for contigous blocks");
|
1997-02-10 02:22:35 +00:00
|
|
|
|
1999-01-06 17:04:33 +00:00
|
|
|
/*
 * DEBUG-only flag, off by default.  NOTE(review): presumably enables
 * printing of block reallocations; its readers are outside this chunk --
 * confirm.
 */
#ifdef DEBUG
|
|
|
|
static volatile int prtrealloc = 0;
|
|
|
|
#endif
|
1997-11-22 07:00:40 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
int
|
|
|
|
ffs_reallocblks(ap)
|
|
|
|
struct vop_reallocblks_args /* {
|
|
|
|
struct vnode *a_vp;
|
|
|
|
struct cluster_save *a_buflist;
|
|
|
|
} */ *ap;
|
|
|
|
{
|
2015-10-16 03:06:02 +00:00
|
|
|
struct ufsmount *ump;
|
2002-06-21 06:18:05 +00:00
|
|
|
|
2015-10-16 03:06:02 +00:00
|
|
|
/*
|
|
|
|
* If the underlying device can do deletes, then skip reallocating
|
|
|
|
* the blocks of this file into contiguous sequences. Devices that
|
|
|
|
* benefit from BIO_DELETE also benefit from not moving the data.
|
|
|
|
* These devices are flash and therefore work less well with this
|
|
|
|
* optimization. Also skip if reallocblks has been disabled globally.
|
|
|
|
*/
|
2016-09-17 16:47:34 +00:00
|
|
|
ump = ap->a_vp->v_mount->mnt_data;
|
2015-10-16 03:06:02 +00:00
|
|
|
if (ump->um_candelete || doreallocblks == 0)
|
2002-06-21 06:18:05 +00:00
|
|
|
return (ENOSPC);
|
2015-10-16 03:06:02 +00:00
|
|
|
|
2010-04-24 07:05:35 +00:00
|
|
|
/*
|
|
|
|
* We can't wait in softdep prealloc as it may fsync and recurse
|
|
|
|
* here. Instead we simply fail to reallocate blocks if this
|
|
|
|
* rare condition arises.
|
|
|
|
*/
|
|
|
|
if (DOINGSOFTDEP(ap->a_vp))
|
|
|
|
if (softdep_prealloc(ap->a_vp, MNT_NOWAIT) != 0)
|
|
|
|
return (ENOSPC);
|
2015-10-16 03:06:02 +00:00
|
|
|
if (ump->um_fstype == UFS1)
|
2002-06-21 06:18:05 +00:00
|
|
|
return (ffs_reallocblks_ufs1(ap));
|
|
|
|
return (ffs_reallocblks_ufs2(ap));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
ffs_reallocblks_ufs1(ap)
|
|
|
|
struct vop_reallocblks_args /* {
|
|
|
|
struct vnode *a_vp;
|
|
|
|
struct cluster_save *a_buflist;
|
|
|
|
} */ *ap;
|
|
|
|
{
|
1994-05-24 10:09:53 +00:00
|
|
|
struct fs *fs;
|
|
|
|
struct inode *ip;
|
|
|
|
struct vnode *vp;
|
|
|
|
struct buf *sbp, *ebp;
|
2016-04-10 21:48:11 +00:00
|
|
|
ufs1_daddr_t *bap, *sbap, *ebap;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct cluster_save *buflist;
|
2005-01-24 10:08:35 +00:00
|
|
|
struct ufsmount *ump;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs_lbn_t start_lbn, end_lbn;
|
|
|
|
ufs1_daddr_t soff, newblk, blkno;
|
|
|
|
ufs2_daddr_t pref;
|
2017-02-15 19:50:26 +00:00
|
|
|
struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp;
|
2015-04-24 23:27:50 +00:00
|
|
|
int i, cg, len, start_lvl, end_lvl, ssize;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
vp = ap->a_vp;
|
|
|
|
ip = VTOI(vp);
|
2016-09-17 16:47:34 +00:00
|
|
|
ump = ITOUMP(ip);
|
|
|
|
fs = ump->um_fs;
|
2013-08-28 17:38:05 +00:00
|
|
|
/*
|
2013-12-30 17:04:24 +00:00
|
|
|
* If we are not tracking block clusters or if we have less than 4%
|
2013-08-28 17:38:05 +00:00
|
|
|
* free blocks left, then do not attempt to cluster. Running with
|
|
|
|
* less than 5% free block reserve is not recommended and those that
|
|
|
|
* choose to do so do not expect to have good file layout.
|
|
|
|
*/
|
2013-12-30 17:04:24 +00:00
|
|
|
if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
return (ENOSPC);
|
|
|
|
buflist = ap->a_buflist;
|
|
|
|
len = buflist->bs_nchildren;
|
|
|
|
start_lbn = buflist->bs_children[0]->b_lblkno;
|
|
|
|
end_lbn = start_lbn + len - 1;
|
2007-11-08 17:21:51 +00:00
|
|
|
#ifdef INVARIANTS
|
1997-02-10 02:22:35 +00:00
|
|
|
for (i = 0; i < len; i++)
|
|
|
|
if (!ffs_checkblk(ip,
|
|
|
|
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
|
|
|
|
panic("ffs_reallocblks: unallocated block 1");
|
1994-05-24 10:09:53 +00:00
|
|
|
for (i = 1; i < len; i++)
|
|
|
|
if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
|
1997-02-10 02:22:35 +00:00
|
|
|
panic("ffs_reallocblks: non-logical cluster");
|
|
|
|
blkno = buflist->bs_children[0]->b_blkno;
|
|
|
|
ssize = fsbtodb(fs, fs->fs_frag);
|
|
|
|
for (i = 1; i < len - 1; i++)
|
|
|
|
if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
|
|
|
|
panic("ffs_reallocblks: non-physical cluster %d", i);
|
1994-05-24 10:09:53 +00:00
|
|
|
#endif
|
2012-11-03 18:55:55 +00:00
|
|
|
/*
|
|
|
|
* If the cluster crosses the boundary for the first indirect
|
|
|
|
* block, leave space for the indirect block. Indirect blocks
|
|
|
|
* are initially laid out in a position after the last direct
|
|
|
|
* block. Block reallocation would usually destroy locality by
|
|
|
|
* moving the indirect block out of the way to make room for
|
|
|
|
* data blocks if we didn't compensate here. We should also do
|
|
|
|
* this for other indirect block boundaries, but it is only
|
|
|
|
* important for the first one.
|
|
|
|
*/
|
2017-02-15 19:50:26 +00:00
|
|
|
if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR)
|
2012-11-03 18:55:55 +00:00
|
|
|
return (ENOSPC);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* If the latest allocation is in a new cylinder group, assume that
|
|
|
|
* the filesystem has decided to move and do not force it back to
|
|
|
|
* the previous cylinder group.
|
|
|
|
*/
|
|
|
|
if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
|
|
|
|
dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
|
|
|
|
return (ENOSPC);
|
|
|
|
if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
|
|
|
|
ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
|
|
|
|
return (ENOSPC);
|
|
|
|
/*
|
|
|
|
* Get the starting offset and block map for the first block.
|
|
|
|
*/
|
|
|
|
if (start_lvl == 0) {
|
2002-06-21 06:18:05 +00:00
|
|
|
sbap = &ip->i_din1->di_db[0];
|
1994-05-24 10:09:53 +00:00
|
|
|
soff = start_lbn;
|
|
|
|
} else {
|
|
|
|
idp = &start_ap[start_lvl - 1];
|
|
|
|
if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
|
|
|
|
brelse(sbp);
|
|
|
|
return (ENOSPC);
|
|
|
|
}
|
2002-06-21 06:18:05 +00:00
|
|
|
sbap = (ufs1_daddr_t *)sbp->b_data;
|
1994-05-24 10:09:53 +00:00
|
|
|
soff = idp->in_off;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If the block range spans two block maps, get the second map.
|
|
|
|
*/
|
2016-04-10 21:48:11 +00:00
|
|
|
ebap = NULL;
|
1994-05-24 10:09:53 +00:00
|
|
|
if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
|
|
|
|
ssize = len;
|
|
|
|
} else {
|
2007-11-08 17:21:51 +00:00
|
|
|
#ifdef INVARIANTS
|
2007-12-01 13:12:43 +00:00
|
|
|
if (start_lvl > 0 &&
|
|
|
|
start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
|
1994-05-24 10:09:53 +00:00
|
|
|
panic("ffs_reallocblk: start == end");
|
|
|
|
#endif
|
|
|
|
ssize = len - (idp->in_off + 1);
|
|
|
|
if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
|
|
|
|
goto fail;
|
2002-06-21 06:18:05 +00:00
|
|
|
ebap = (ufs1_daddr_t *)ebp->b_data;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-01-24 10:08:35 +00:00
|
|
|
/*
|
2015-04-24 23:27:50 +00:00
|
|
|
* Find the preferred location for the cluster. If we have not
|
|
|
|
* previously failed at this endeavor, then follow our standard
|
|
|
|
* preference calculation. If we have failed at it, then pick up
|
|
|
|
* where we last ended our search.
|
2005-01-24 10:08:35 +00:00
|
|
|
*/
|
|
|
|
UFS_LOCK(ump);
|
2015-04-24 23:27:50 +00:00
|
|
|
if (ip->i_nextclustercg == -1)
|
|
|
|
pref = ffs_blkpref_ufs1(ip, start_lbn, soff, sbap);
|
|
|
|
else
|
|
|
|
pref = cgdata(fs, ip->i_nextclustercg);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Search the block map looking for an allocation of the desired size.
|
2015-04-24 23:27:50 +00:00
|
|
|
* To avoid wasting too much time, we limit the number of cylinder
|
|
|
|
* groups that we will search.
|
|
|
|
*/
|
|
|
|
cg = dtog(fs, pref);
|
|
|
|
for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) {
|
|
|
|
if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0)
|
|
|
|
break;
|
|
|
|
cg += 1;
|
|
|
|
if (cg >= fs->fs_ncg)
|
|
|
|
cg = 0;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If we have failed in our search, record where we gave up for
|
|
|
|
* next time. Otherwise, fall back to our usual search citerion.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2015-04-24 23:27:50 +00:00
|
|
|
if (newblk == 0) {
|
|
|
|
ip->i_nextclustercg = cg;
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_UNLOCK(ump);
|
1994-05-24 10:09:53 +00:00
|
|
|
goto fail;
|
2005-01-24 10:08:35 +00:00
|
|
|
}
|
2015-04-24 23:27:50 +00:00
|
|
|
ip->i_nextclustercg = -1;
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* We have found a new contiguous block.
|
|
|
|
*
|
|
|
|
* First we have to replace the old block pointers with the new
|
|
|
|
* block pointers in the inode and indirect blocks associated
|
|
|
|
* with the file.
|
|
|
|
*/
|
1997-02-10 02:22:35 +00:00
|
|
|
#ifdef DEBUG
|
|
|
|
if (prtrealloc)
|
2012-09-27 23:30:49 +00:00
|
|
|
printf("realloc: ino %ju, lbns %jd-%jd\n\told:",
|
|
|
|
(uintmax_t)ip->i_number,
|
2002-06-21 06:18:05 +00:00
|
|
|
(intmax_t)start_lbn, (intmax_t)end_lbn);
|
1997-02-10 02:22:35 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
blkno = newblk;
|
|
|
|
for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
|
1998-03-08 09:59:44 +00:00
|
|
|
if (i == ssize) {
|
1994-05-24 10:09:53 +00:00
|
|
|
bap = ebap;
|
1998-03-08 09:59:44 +00:00
|
|
|
soff = -i;
|
|
|
|
}
|
2007-11-08 17:21:51 +00:00
|
|
|
#ifdef INVARIANTS
|
1997-02-10 02:22:35 +00:00
|
|
|
if (!ffs_checkblk(ip,
|
|
|
|
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
|
|
|
|
panic("ffs_reallocblks: unallocated block 2");
|
|
|
|
if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
|
1994-05-24 10:09:53 +00:00
|
|
|
panic("ffs_reallocblks: alloc mismatch");
|
1997-02-10 02:22:35 +00:00
|
|
|
#endif
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (prtrealloc)
|
|
|
|
printf(" %d,", *bap);
|
1994-05-24 10:09:53 +00:00
|
|
|
#endif
|
1998-03-08 09:59:44 +00:00
|
|
|
if (DOINGSOFTDEP(vp)) {
|
2002-06-21 06:18:05 +00:00
|
|
|
if (sbap == &ip->i_din1->di_db[0] && i < ssize)
|
1998-03-08 09:59:44 +00:00
|
|
|
softdep_setup_allocdirect(ip, start_lbn + i,
|
|
|
|
blkno, *bap, fs->fs_bsize, fs->fs_bsize,
|
|
|
|
buflist->bs_children[i]);
|
|
|
|
else
|
|
|
|
softdep_setup_allocindir_page(ip, start_lbn + i,
|
|
|
|
i < ssize ? sbp : ebp, soff + i, blkno,
|
|
|
|
*bap, buflist->bs_children[i]);
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
*bap++ = blkno;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Next we must write out the modified inode and indirect blocks.
|
|
|
|
* For strict correctness, the writes should be synchronous since
|
|
|
|
* the old block values may have been written to disk. In practise
|
1995-05-30 08:16:23 +00:00
|
|
|
* they are almost never written, but if we are concerned about
|
1994-05-24 10:09:53 +00:00
|
|
|
* strict correctness, the `doasyncfree' flag should be set to zero.
|
|
|
|
*
|
|
|
|
* The test on `doasyncfree' should be changed to test a flag
|
|
|
|
* that shows whether the associated buffers and inodes have
|
|
|
|
* been written. The flag should be set when the cluster is
|
|
|
|
* started and cleared whenever the buffer or inode is flushed.
|
|
|
|
* We can then check below to see if it is set, and do the
|
|
|
|
* synchronous write only when it has been cleared.
|
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
if (sbap != &ip->i_din1->di_db[0]) {
|
1994-05-24 10:09:53 +00:00
|
|
|
if (doasyncfree)
|
|
|
|
bdwrite(sbp);
|
|
|
|
else
|
|
|
|
bwrite(sbp);
|
|
|
|
} else {
|
|
|
|
ip->i_flag |= IN_CHANGE | IN_UPDATE;
|
1999-01-07 16:14:19 +00:00
|
|
|
if (!doasyncfree)
|
2005-02-08 17:40:01 +00:00
|
|
|
ffs_update(vp, 1);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
1999-05-06 18:13:11 +00:00
|
|
|
if (ssize < len) {
|
1994-05-24 10:09:53 +00:00
|
|
|
if (doasyncfree)
|
|
|
|
bdwrite(ebp);
|
|
|
|
else
|
|
|
|
bwrite(ebp);
|
1999-05-06 18:13:11 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Last, free the old blocks and assign the new blocks to the buffers.
|
|
|
|
*/
|
1997-02-10 02:22:35 +00:00
|
|
|
#ifdef DEBUG
|
|
|
|
if (prtrealloc)
|
|
|
|
printf("\n\tnew:");
|
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
|
1998-03-08 09:59:44 +00:00
|
|
|
if (!DOINGSOFTDEP(vp))
|
2016-09-17 16:47:34 +00:00
|
|
|
ffs_blkfree(ump, fs, ump->um_devvp,
|
1998-03-08 09:59:44 +00:00
|
|
|
dbtofsb(fs, buflist->bs_children[i]->b_blkno),
|
2011-06-15 23:19:09 +00:00
|
|
|
fs->fs_bsize, ip->i_number, vp->v_type, NULL);
|
1994-05-24 10:09:53 +00:00
|
|
|
buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
|
2007-11-08 17:21:51 +00:00
|
|
|
#ifdef INVARIANTS
|
1997-02-10 02:22:35 +00:00
|
|
|
if (!ffs_checkblk(ip,
|
|
|
|
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
|
|
|
|
panic("ffs_reallocblks: unallocated block 3");
|
1999-08-24 08:39:41 +00:00
|
|
|
#endif
|
|
|
|
#ifdef DEBUG
|
1997-02-10 02:22:35 +00:00
|
|
|
if (prtrealloc)
|
|
|
|
printf(" %d,", blkno);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (prtrealloc) {
|
|
|
|
prtrealloc--;
|
|
|
|
printf("\n");
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
1997-02-10 02:22:35 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
fail:
|
|
|
|
if (ssize < len)
|
|
|
|
brelse(ebp);
|
2002-06-21 06:18:05 +00:00
|
|
|
if (sbap != &ip->i_din1->di_db[0])
|
|
|
|
brelse(sbp);
|
|
|
|
return (ENOSPC);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int
|
|
|
|
ffs_reallocblks_ufs2(ap)
|
|
|
|
struct vop_reallocblks_args /* {
|
|
|
|
struct vnode *a_vp;
|
|
|
|
struct cluster_save *a_buflist;
|
|
|
|
} */ *ap;
|
|
|
|
{
|
|
|
|
struct fs *fs;
|
|
|
|
struct inode *ip;
|
|
|
|
struct vnode *vp;
|
|
|
|
struct buf *sbp, *ebp;
|
2016-04-10 21:48:11 +00:00
|
|
|
ufs2_daddr_t *bap, *sbap, *ebap;
|
2002-06-21 06:18:05 +00:00
|
|
|
struct cluster_save *buflist;
|
2005-01-24 10:08:35 +00:00
|
|
|
struct ufsmount *ump;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs_lbn_t start_lbn, end_lbn;
|
|
|
|
ufs2_daddr_t soff, newblk, blkno, pref;
|
2017-02-15 19:50:26 +00:00
|
|
|
struct indir start_ap[UFS_NIADDR + 1], end_ap[UFS_NIADDR + 1], *idp;
|
2015-04-24 23:27:50 +00:00
|
|
|
int i, cg, len, start_lvl, end_lvl, ssize;
|
2002-06-21 06:18:05 +00:00
|
|
|
|
|
|
|
vp = ap->a_vp;
|
|
|
|
ip = VTOI(vp);
|
2016-09-17 16:47:34 +00:00
|
|
|
ump = ITOUMP(ip);
|
|
|
|
fs = ump->um_fs;
|
2013-08-28 17:38:05 +00:00
|
|
|
/*
|
2013-12-30 17:04:24 +00:00
|
|
|
* If we are not tracking block clusters or if we have less than 4%
|
2013-08-28 17:38:05 +00:00
|
|
|
* free blocks left, then do not attempt to cluster. Running with
|
|
|
|
* less than 5% free block reserve is not recommended and those that
|
|
|
|
* choose to do so do not expect to have good file layout.
|
|
|
|
*/
|
2013-12-30 17:04:24 +00:00
|
|
|
if (fs->fs_contigsumsize <= 0 || freespace(fs, 4) < 0)
|
2002-06-21 06:18:05 +00:00
|
|
|
return (ENOSPC);
|
|
|
|
buflist = ap->a_buflist;
|
|
|
|
len = buflist->bs_nchildren;
|
|
|
|
start_lbn = buflist->bs_children[0]->b_lblkno;
|
|
|
|
end_lbn = start_lbn + len - 1;
|
2007-11-08 17:21:51 +00:00
|
|
|
#ifdef INVARIANTS
|
2002-06-21 06:18:05 +00:00
|
|
|
for (i = 0; i < len; i++)
|
|
|
|
if (!ffs_checkblk(ip,
|
|
|
|
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
|
|
|
|
panic("ffs_reallocblks: unallocated block 1");
|
|
|
|
for (i = 1; i < len; i++)
|
|
|
|
if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
|
|
|
|
panic("ffs_reallocblks: non-logical cluster");
|
|
|
|
blkno = buflist->bs_children[0]->b_blkno;
|
|
|
|
ssize = fsbtodb(fs, fs->fs_frag);
|
|
|
|
for (i = 1; i < len - 1; i++)
|
|
|
|
if (buflist->bs_children[i]->b_blkno != blkno + (i * ssize))
|
|
|
|
panic("ffs_reallocblks: non-physical cluster %d", i);
|
|
|
|
#endif
|
2012-11-03 18:55:55 +00:00
|
|
|
/*
|
|
|
|
* If the cluster crosses the boundary for the first indirect
|
|
|
|
* block, do not move anything in it. Indirect blocks are
|
|
|
|
* usually initially laid out in a position between the data
|
|
|
|
* blocks. Block reallocation would usually destroy locality by
|
|
|
|
* moving the indirect block out of the way to make room for
|
|
|
|
* data blocks if we didn't compensate here. We should also do
|
|
|
|
* this for other indirect block boundaries, but it is only
|
|
|
|
* important for the first one.
|
|
|
|
*/
|
2017-02-15 19:50:26 +00:00
|
|
|
if (start_lbn < UFS_NDADDR && end_lbn >= UFS_NDADDR)
|
2012-11-03 18:55:55 +00:00
|
|
|
return (ENOSPC);
|
2002-06-21 06:18:05 +00:00
|
|
|
/*
|
|
|
|
* If the latest allocation is in a new cylinder group, assume that
|
|
|
|
* the filesystem has decided to move and do not force it back to
|
|
|
|
* the previous cylinder group.
|
|
|
|
*/
|
|
|
|
if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
|
|
|
|
dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
|
|
|
|
return (ENOSPC);
|
|
|
|
if (ufs_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
|
|
|
|
ufs_getlbns(vp, end_lbn, end_ap, &end_lvl))
|
|
|
|
return (ENOSPC);
|
|
|
|
/*
|
|
|
|
* Get the starting offset and block map for the first block.
|
|
|
|
*/
|
|
|
|
if (start_lvl == 0) {
|
|
|
|
sbap = &ip->i_din2->di_db[0];
|
|
|
|
soff = start_lbn;
|
|
|
|
} else {
|
|
|
|
idp = &start_ap[start_lvl - 1];
|
|
|
|
if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &sbp)) {
|
|
|
|
brelse(sbp);
|
|
|
|
return (ENOSPC);
|
|
|
|
}
|
|
|
|
sbap = (ufs2_daddr_t *)sbp->b_data;
|
|
|
|
soff = idp->in_off;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If the block range spans two block maps, get the second map.
|
|
|
|
*/
|
2016-04-10 21:48:11 +00:00
|
|
|
ebap = NULL;
|
2002-06-21 06:18:05 +00:00
|
|
|
if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
|
|
|
|
ssize = len;
|
|
|
|
} else {
|
2007-11-08 17:21:51 +00:00
|
|
|
#ifdef INVARIANTS
|
2007-12-01 13:12:43 +00:00
|
|
|
if (start_lvl > 0 &&
|
|
|
|
start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
|
2002-06-21 06:18:05 +00:00
|
|
|
panic("ffs_reallocblk: start == end");
|
|
|
|
#endif
|
|
|
|
ssize = len - (idp->in_off + 1);
|
|
|
|
if (bread(vp, idp->in_lbn, (int)fs->fs_bsize, NOCRED, &ebp))
|
|
|
|
goto fail;
|
|
|
|
ebap = (ufs2_daddr_t *)ebp->b_data;
|
|
|
|
}
|
2005-01-24 10:08:35 +00:00
|
|
|
/*
|
2015-04-24 23:27:50 +00:00
|
|
|
* Find the preferred location for the cluster. If we have not
|
|
|
|
* previously failed at this endeavor, then follow our standard
|
|
|
|
* preference calculation. If we have failed at it, then pick up
|
|
|
|
* where we last ended our search.
|
2005-01-24 10:08:35 +00:00
|
|
|
*/
|
|
|
|
UFS_LOCK(ump);
|
2015-04-24 23:27:50 +00:00
|
|
|
if (ip->i_nextclustercg == -1)
|
|
|
|
pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap);
|
|
|
|
else
|
|
|
|
pref = cgdata(fs, ip->i_nextclustercg);
|
2002-06-21 06:18:05 +00:00
|
|
|
/*
|
|
|
|
* Search the block map looking for an allocation of the desired size.
|
2015-04-24 23:27:50 +00:00
|
|
|
* To avoid wasting too much time, we limit the number of cylinder
|
|
|
|
* groups that we will search.
|
|
|
|
*/
|
|
|
|
cg = dtog(fs, pref);
|
|
|
|
for (i = min(maxclustersearch, fs->fs_ncg); i > 0; i--) {
|
|
|
|
if ((newblk = ffs_clusteralloc(ip, cg, pref, len)) != 0)
|
|
|
|
break;
|
|
|
|
cg += 1;
|
|
|
|
if (cg >= fs->fs_ncg)
|
|
|
|
cg = 0;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* If we have failed in our search, record where we gave up for
|
|
|
|
* next time. Otherwise, fall back to our usual search citerion.
|
2002-06-21 06:18:05 +00:00
|
|
|
*/
|
2015-04-24 23:27:50 +00:00
|
|
|
if (newblk == 0) {
|
|
|
|
ip->i_nextclustercg = cg;
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_UNLOCK(ump);
|
2002-06-21 06:18:05 +00:00
|
|
|
goto fail;
|
2005-01-24 10:08:35 +00:00
|
|
|
}
|
2015-04-24 23:27:50 +00:00
|
|
|
ip->i_nextclustercg = -1;
|
2002-06-21 06:18:05 +00:00
|
|
|
/*
|
|
|
|
* We have found a new contiguous block.
|
|
|
|
*
|
|
|
|
* First we have to replace the old block pointers with the new
|
|
|
|
* block pointers in the inode and indirect blocks associated
|
|
|
|
* with the file.
|
|
|
|
*/
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (prtrealloc)
|
2014-12-17 07:27:19 +00:00
|
|
|
printf("realloc: ino %ju, lbns %jd-%jd\n\told:", (uintmax_t)ip->i_number,
|
2002-06-21 06:18:05 +00:00
|
|
|
(intmax_t)start_lbn, (intmax_t)end_lbn);
|
|
|
|
#endif
|
|
|
|
blkno = newblk;
|
|
|
|
for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->fs_frag) {
|
|
|
|
if (i == ssize) {
|
|
|
|
bap = ebap;
|
|
|
|
soff = -i;
|
|
|
|
}
|
2007-11-08 17:21:51 +00:00
|
|
|
#ifdef INVARIANTS
|
2002-06-21 06:18:05 +00:00
|
|
|
if (!ffs_checkblk(ip,
|
|
|
|
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
|
|
|
|
panic("ffs_reallocblks: unallocated block 2");
|
|
|
|
if (dbtofsb(fs, buflist->bs_children[i]->b_blkno) != *bap)
|
|
|
|
panic("ffs_reallocblks: alloc mismatch");
|
|
|
|
#endif
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (prtrealloc)
|
2002-09-19 03:55:30 +00:00
|
|
|
printf(" %jd,", (intmax_t)*bap);
|
2002-06-21 06:18:05 +00:00
|
|
|
#endif
|
|
|
|
if (DOINGSOFTDEP(vp)) {
|
|
|
|
if (sbap == &ip->i_din2->di_db[0] && i < ssize)
|
|
|
|
softdep_setup_allocdirect(ip, start_lbn + i,
|
|
|
|
blkno, *bap, fs->fs_bsize, fs->fs_bsize,
|
|
|
|
buflist->bs_children[i]);
|
|
|
|
else
|
|
|
|
softdep_setup_allocindir_page(ip, start_lbn + i,
|
|
|
|
i < ssize ? sbp : ebp, soff + i, blkno,
|
|
|
|
*bap, buflist->bs_children[i]);
|
|
|
|
}
|
|
|
|
*bap++ = blkno;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Next we must write out the modified inode and indirect blocks.
|
|
|
|
* For strict correctness, the writes should be synchronous since
|
|
|
|
* the old block values may have been written to disk. In practise
|
|
|
|
* they are almost never written, but if we are concerned about
|
|
|
|
* strict correctness, the `doasyncfree' flag should be set to zero.
|
|
|
|
*
|
|
|
|
* The test on `doasyncfree' should be changed to test a flag
|
|
|
|
* that shows whether the associated buffers and inodes have
|
|
|
|
* been written. The flag should be set when the cluster is
|
|
|
|
* started and cleared whenever the buffer or inode is flushed.
|
|
|
|
* We can then check below to see if it is set, and do the
|
|
|
|
* synchronous write only when it has been cleared.
|
|
|
|
*/
|
|
|
|
if (sbap != &ip->i_din2->di_db[0]) {
|
|
|
|
if (doasyncfree)
|
|
|
|
bdwrite(sbp);
|
|
|
|
else
|
|
|
|
bwrite(sbp);
|
|
|
|
} else {
|
|
|
|
ip->i_flag |= IN_CHANGE | IN_UPDATE;
|
|
|
|
if (!doasyncfree)
|
2005-02-08 17:40:01 +00:00
|
|
|
ffs_update(vp, 1);
|
2002-06-21 06:18:05 +00:00
|
|
|
}
|
|
|
|
if (ssize < len) {
|
|
|
|
if (doasyncfree)
|
|
|
|
bdwrite(ebp);
|
|
|
|
else
|
|
|
|
bwrite(ebp);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Last, free the old blocks and assign the new blocks to the buffers.
|
|
|
|
*/
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (prtrealloc)
|
|
|
|
printf("\n\tnew:");
|
|
|
|
#endif
|
|
|
|
for (blkno = newblk, i = 0; i < len; i++, blkno += fs->fs_frag) {
|
|
|
|
if (!DOINGSOFTDEP(vp))
|
2016-09-17 16:47:34 +00:00
|
|
|
ffs_blkfree(ump, fs, ump->um_devvp,
|
2002-06-21 06:18:05 +00:00
|
|
|
dbtofsb(fs, buflist->bs_children[i]->b_blkno),
|
2011-06-15 23:19:09 +00:00
|
|
|
fs->fs_bsize, ip->i_number, vp->v_type, NULL);
|
2002-06-21 06:18:05 +00:00
|
|
|
buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
|
2007-11-08 17:21:51 +00:00
|
|
|
#ifdef INVARIANTS
|
2002-06-21 06:18:05 +00:00
|
|
|
if (!ffs_checkblk(ip,
|
|
|
|
dbtofsb(fs, buflist->bs_children[i]->b_blkno), fs->fs_bsize))
|
|
|
|
panic("ffs_reallocblks: unallocated block 3");
|
|
|
|
#endif
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (prtrealloc)
|
2002-07-08 12:42:29 +00:00
|
|
|
printf(" %jd,", (intmax_t)blkno);
|
2002-06-21 06:18:05 +00:00
|
|
|
#endif
|
|
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (prtrealloc) {
|
|
|
|
prtrealloc--;
|
|
|
|
printf("\n");
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
return (0);
|
|
|
|
|
|
|
|
fail:
|
|
|
|
if (ssize < len)
|
|
|
|
brelse(ebp);
|
|
|
|
if (sbap != &ip->i_din2->di_db[0])
|
1994-05-24 10:09:53 +00:00
|
|
|
brelse(sbp);
|
|
|
|
return (ENOSPC);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2002-05-16 21:28:32 +00:00
|
|
|
* Allocate an inode in the filesystem.
|
1995-05-30 08:16:23 +00:00
|
|
|
*
|
1994-05-24 10:09:53 +00:00
|
|
|
* If allocating a directory, use ffs_dirpref to select the inode.
|
|
|
|
* If allocating in a directory, the following hierarchy is followed:
|
|
|
|
* 1) allocate the preferred inode.
|
|
|
|
* 2) allocate an inode in the same cylinder group.
|
|
|
|
* 3) quadratically rehash into other cylinder groups, until an
|
|
|
|
* available inode is located.
|
2007-01-16 19:35:43 +00:00
|
|
|
* If no inode preference is given the following hierarchy is used
|
1994-05-24 10:09:53 +00:00
|
|
|
* to allocate an inode:
|
|
|
|
* 1) allocate an inode in cylinder group 0.
|
|
|
|
* 2) quadratically rehash into other cylinder groups, until an
|
|
|
|
* available inode is located.
|
|
|
|
*/
|
1994-05-25 09:21:21 +00:00
|
|
|
int
|
VFS mega cleanup commit (x/N)
1. Add new file "sys/kern/vfs_default.c" where default actions for
VOPs go. Implement proper defaults for ABORTOP, BWRITE, LEASE,
POLL, REVOKE and STRATEGY. Various stuff spread over the entire
tree belongs here.
2. Change VOP_BLKATOFF to a normal function in cd9660.
3. Kill VOP_BLKATOFF, VOP_TRUNCATE, VOP_VFREE, VOP_VALLOC. These
are private interface functions between UFS and the underlying
storage manager layer (FFS/LFS/MFS/EXT2FS). The functions now
live in struct ufsmount instead.
4. Remove a kludge of VOP_ functions in all filesystems, that did
nothing but obscure the simplicity and break the expandability.
If a filesystem doesn't implement VOP_FOO, it shouldn't have an
entry for it in its vnops table. The system will try to DTRT
if it is not implemented. There are still some cruft left, but
the bulk of it is done.
5. Fix another VCALL in vfs_cache.c (thanks Bruce!)
1997-10-16 10:50:27 +00:00
|
|
|
ffs_valloc(pvp, mode, cred, vpp)
|
|
|
|
struct vnode *pvp;
|
|
|
|
int mode;
|
|
|
|
struct ucred *cred;
|
|
|
|
struct vnode **vpp;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2002-05-13 09:22:31 +00:00
|
|
|
struct inode *pip;
|
|
|
|
struct fs *fs;
|
|
|
|
struct inode *ip;
|
2002-06-21 06:18:05 +00:00
|
|
|
struct timespec ts;
|
2005-01-24 10:08:35 +00:00
|
|
|
struct ufsmount *ump;
|
1994-05-24 10:09:53 +00:00
|
|
|
ino_t ino, ipref;
|
2010-02-10 20:10:35 +00:00
|
|
|
u_int cg;
|
2011-03-23 05:13:54 +00:00
|
|
|
int error, error1, reclaimed;
|
2005-10-31 20:33:28 +00:00
|
|
|
static struct timeval lastfail;
|
|
|
|
static int curfail;
|
1995-05-30 08:16:23 +00:00
|
|
|
|
VFS mega cleanup commit (x/N)
1. Add new file "sys/kern/vfs_default.c" where default actions for
VOPs go. Implement proper defaults for ABORTOP, BWRITE, LEASE,
POLL, REVOKE and STRATEGY. Various stuff spread over the entire
tree belongs here.
2. Change VOP_BLKATOFF to a normal function in cd9660.
3. Kill VOP_BLKATOFF, VOP_TRUNCATE, VOP_VFREE, VOP_VALLOC. These
are private interface functions between UFS and the underlying
storage manager layer (FFS/LFS/MFS/EXT2FS). The functions now
live in struct ufsmount instead.
4. Remove a kludge of VOP_ functions in all filesystems, that did
nothing but obscure the simplicity and break the expandability.
If a filesystem doesn't implement VOP_FOO, it shouldn't have an
entry for it in its vnops table. The system will try to DTRT
if it is not implemented. There are still some cruft left, but
the bulk of it is done.
5. Fix another VCALL in vfs_cache.c (thanks Bruce!)
1997-10-16 10:50:27 +00:00
|
|
|
*vpp = NULL;
|
1994-05-24 10:09:53 +00:00
|
|
|
pip = VTOI(pvp);
|
2016-09-17 16:47:34 +00:00
|
|
|
ump = ITOUMP(pip);
|
|
|
|
fs = ump->um_fs;
|
2005-01-24 10:08:35 +00:00
|
|
|
|
|
|
|
UFS_LOCK(ump);
|
2011-03-23 05:13:54 +00:00
|
|
|
reclaimed = 0;
|
|
|
|
retry:
|
1994-05-24 10:09:53 +00:00
|
|
|
if (fs->fs_cstotal.cs_nifree == 0)
|
|
|
|
goto noinodes;
|
|
|
|
|
2018-03-17 12:59:55 +00:00
|
|
|
if ((mode & IFMT) == IFDIR)
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems then on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the perfomance speedup may be. I have done two file/directory
intensive tests on a two OpenBSD systems with old and new dirpref algorithm.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirprefnew dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which speed increased, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speedup a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
ipref = ffs_dirpref(pip);
|
1994-05-24 10:09:53 +00:00
|
|
|
else
|
|
|
|
ipref = pip->i_number;
|
2002-12-18 00:53:45 +00:00
|
|
|
if (ipref >= fs->fs_ncg * fs->fs_ipg)
|
1994-05-24 10:09:53 +00:00
|
|
|
ipref = 0;
|
|
|
|
cg = ino_to_cg(fs, ipref);
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which speed increased, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speedup a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
/*
|
|
|
|
* Track number of dirs created one after another
|
|
|
|
* in a same cg without intervening by files.
|
|
|
|
*/
|
2018-03-17 12:59:55 +00:00
|
|
|
if ((mode & IFMT) == IFDIR) {
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which speed increased, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speedup a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
if (fs->fs_contigdirs[cg] < 255)
|
|
|
|
fs->fs_contigdirs[cg]++;
|
|
|
|
} else {
|
|
|
|
if (fs->fs_contigdirs[cg] > 0)
|
|
|
|
fs->fs_contigdirs[cg]--;
|
|
|
|
}
|
2010-04-24 07:05:35 +00:00
|
|
|
ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0,
|
1995-12-15 03:36:25 +00:00
|
|
|
(allocfcn_t *)ffs_nodealloccg);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (ino == 0)
|
|
|
|
goto noinodes;
|
2005-02-08 17:40:01 +00:00
|
|
|
error = ffs_vget(pvp->v_mount, ino, LK_EXCLUSIVE, vpp);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (error) {
|
2008-08-28 09:19:50 +00:00
|
|
|
error1 = ffs_vgetf(pvp->v_mount, ino, LK_EXCLUSIVE, vpp,
|
|
|
|
FFSV_FORCEINSMQ);
|
2005-02-08 17:40:01 +00:00
|
|
|
ffs_vfree(pvp, ino, mode);
|
2008-08-28 09:19:50 +00:00
|
|
|
if (error1 == 0) {
|
|
|
|
ip = VTOI(*vpp);
|
|
|
|
if (ip->i_mode)
|
|
|
|
goto dup_alloc;
|
|
|
|
ip->i_flag |= IN_MODIFIED;
|
|
|
|
vput(*vpp);
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
return (error);
|
|
|
|
}
|
VFS mega cleanup commit (x/N)
1. Add new file "sys/kern/vfs_default.c" where default actions for
VOPs go. Implement proper defaults for ABORTOP, BWRITE, LEASE,
POLL, REVOKE and STRATEGY. Various stuff spread over the entire
tree belongs here.
2. Change VOP_BLKATOFF to a normal function in cd9660.
3. Kill VOP_BLKATOFF, VOP_TRUNCATE, VOP_VFREE, VOP_VALLOC. These
are private interface functions between UFS and the underlying
storage manager layer (FFS/LFS/MFS/EXT2FS). The functions now
live in struct ufsmount instead.
4. Remove a kludge of VOP_ functions in all filesystems, that did
nothing but obscure the simplicity and break the expandability.
If a filesystem doesn't implement VOP_FOO, it shouldn't have an
entry for it in its vnops table. The system will try to DTRT
if it is not implemented. There are still some cruft left, but
the bulk of it is done.
5. Fix another VCALL in vfs_cache.c (thanks Bruce!)
1997-10-16 10:50:27 +00:00
|
|
|
ip = VTOI(*vpp);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (ip->i_mode) {
|
2008-08-28 09:19:50 +00:00
|
|
|
dup_alloc:
|
2014-12-17 07:27:19 +00:00
|
|
|
printf("mode = 0%o, inum = %ju, fs = %s\n",
|
|
|
|
ip->i_mode, (uintmax_t)ip->i_number, fs->fs_fsmnt);
|
1994-05-24 10:09:53 +00:00
|
|
|
panic("ffs_valloc: dup alloc");
|
|
|
|
}
|
2002-06-21 06:18:05 +00:00
|
|
|
if (DIP(ip, i_blocks) && (fs->fs_flags & FS_UNCLEAN) == 0) { /* XXX */
|
1998-07-11 07:46:16 +00:00
|
|
|
printf("free inode %s/%lu had %ld blocks\n",
|
2002-06-21 06:18:05 +00:00
|
|
|
fs->fs_fsmnt, (u_long)ino, (long)DIP(ip, i_blocks));
|
2004-07-28 06:41:27 +00:00
|
|
|
DIP_SET(ip, i_blocks, 0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
ip->i_flags = 0;
|
2004-07-28 06:41:27 +00:00
|
|
|
DIP_SET(ip, i_flags, 0);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Set up a new generation number for this inode.
|
|
|
|
*/
|
2016-05-22 14:31:20 +00:00
|
|
|
while (ip->i_gen == 0 || ++ip->i_gen == 0)
|
|
|
|
ip->i_gen = arc4random();
|
2004-07-28 06:41:27 +00:00
|
|
|
DIP_SET(ip, i_gen, ip->i_gen);
|
2002-06-21 06:18:05 +00:00
|
|
|
if (fs->fs_magic == FS_UFS2_MAGIC) {
|
|
|
|
vfs_timestamp(&ts);
|
2002-07-16 22:36:00 +00:00
|
|
|
ip->i_din2->di_birthtime = ts.tv_sec;
|
|
|
|
ip->i_din2->di_birthnsec = ts.tv_nsec;
|
2002-06-21 06:18:05 +00:00
|
|
|
}
|
VFS sometimes is unable to inactivate a vnode when vnode use count
goes to zero. E.g., the vnode might be only shared-locked at the time of
vput() call. Such vnodes are kept in the hash, so they can be found later.
If ffs_valloc() allocated an inode that has its vnode cached in hash, and
still owing the inactivation, then vget() call from ffs_valloc() clears
VI_OWEINACT, and then the vnode is reused for the newly allocated inode.
The problem is, the vnode is not reclaimed before it is put to the new
use. ffs_valloc() recycles vnode vm object, but this is not enough.
In particular, at least v_vflag should be cleared, and several bits of
UFS state need to be removed.
It is very inconvenient to call vgone() at this point. Instead, move
some parts of ufs_reclaim() into helper function ufs_prepare_reclaim(),
and call the helper from VOP_RECLAIM and ffs_valloc().
Reviewed by: mckusick
Tested by: pho
MFC after: 3 weeks
2011-04-24 10:47:56 +00:00
|
|
|
ufs_prepare_reclaim(*vpp);
|
2005-10-03 21:57:43 +00:00
|
|
|
ip->i_flag = 0;
|
VFS sometimes is unable to inactivate a vnode when vnode use count
goes to zero. E.g., the vnode might be only shared-locked at the time of
vput() call. Such vnodes are kept in the hash, so they can be found later.
If ffs_valloc() allocated an inode that has its vnode cached in hash, and
still owing the inactivation, then vget() call from ffs_valloc() clears
VI_OWEINACT, and then the vnode is reused for the newly allocated inode.
The problem is, the vnode is not reclaimed before it is put to the new
use. ffs_valloc() recycles vnode vm object, but this is not enough.
In particular, at least v_vflag should be cleared, and several bits of
UFS state need to be removed.
It is very inconvenient to call vgone() at this point. Instead, move
some parts of ufs_reclaim() into helper function ufs_prepare_reclaim(),
and call the helper from VOP_RECLAIM and ffs_valloc().
Reviewed by: mckusick
Tested by: pho
MFC after: 3 weeks
2011-04-24 10:47:56 +00:00
|
|
|
(*vpp)->v_vflag = 0;
|
2005-10-09 19:06:34 +00:00
|
|
|
(*vpp)->v_type = VNON;
|
2016-09-17 16:47:34 +00:00
|
|
|
if (fs->fs_magic == FS_UFS2_MAGIC) {
|
2005-10-09 19:06:34 +00:00
|
|
|
(*vpp)->v_op = &ffs_vnodeops2;
|
2016-09-17 16:47:34 +00:00
|
|
|
ip->i_flag |= IN_UFS2;
|
|
|
|
} else {
|
2005-10-09 19:06:34 +00:00
|
|
|
(*vpp)->v_op = &ffs_vnodeops1;
|
2016-09-17 16:47:34 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
return (0);
|
|
|
|
noinodes:
|
2011-05-28 15:07:29 +00:00
|
|
|
if (reclaimed == 0) {
|
2011-03-23 05:13:54 +00:00
|
|
|
reclaimed = 1;
|
2011-04-05 21:26:05 +00:00
|
|
|
softdep_request_cleanup(fs, pvp, cred, FLUSH_INODES_WAIT);
|
2011-03-23 05:13:54 +00:00
|
|
|
goto retry;
|
|
|
|
}
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_UNLOCK(ump);
|
2005-10-31 20:33:28 +00:00
|
|
|
if (ppsratecheck(&lastfail, &curfail, 1)) {
|
|
|
|
ffs_fserr(fs, pip->i_number, "out of inodes");
|
|
|
|
uprintf("\n%s: create/symlink failed, no inodes free\n",
|
|
|
|
fs->fs_fsmnt);
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
return (ENOSPC);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which speed increased, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speedup a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
* Find a cylinder group to place a directory.
|
|
|
|
*
|
|
|
|
* The policy implemented by this algorithm is to allocate a
|
|
|
|
* directory inode in the same cylinder group as its parent
|
|
|
|
* directory, but also to reserve space for its files inodes
|
|
|
|
* and data. Restrict the number of directories which may be
|
|
|
|
* allocated one after another in the same cylinder group
|
|
|
|
* without intervening allocation of files.
|
1994-05-24 10:09:53 +00:00
|
|
|
*
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirprefnew dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which speed increased, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
* If we allocate a first level directory then force allocation
|
|
|
|
* in another cylinder group.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
static ino_t
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirprefnew dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which speed increased, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
ffs_dirpref(pip)
|
|
|
|
struct inode *pip;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2002-05-13 09:22:31 +00:00
|
|
|
struct fs *fs;
|
2013-03-22 21:45:28 +00:00
|
|
|
int cg, prefcg, dirsize, cgsize;
|
2010-02-10 20:10:35 +00:00
|
|
|
u_int avgifree, avgbfree, avgndir, curdirsize;
|
|
|
|
u_int minifree, minbfree, maxndir;
|
|
|
|
u_int mincg, minndir;
|
|
|
|
u_int maxcontigdirs;
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirprefnew dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which speed increased, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
|
2016-09-17 16:47:34 +00:00
|
|
|
mtx_assert(UFS_MTX(ITOUMP(pip)), MA_OWNED);
|
|
|
|
fs = ITOFS(pip);
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirprefnew dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which speed increased, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
|
|
|
|
avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Force allocation in another cg if creating a first level dir.
|
|
|
|
*/
|
2002-08-04 10:29:36 +00:00
|
|
|
ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref");
|
|
|
|
if (ITOV(pip)->v_vflag & VV_ROOT) {
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirprefnew dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which speed increased, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
prefcg = arc4random() % fs->fs_ncg;
|
|
|
|
mincg = prefcg;
|
|
|
|
minndir = fs->fs_ipg;
|
|
|
|
for (cg = prefcg; cg < fs->fs_ncg; cg++)
|
|
|
|
if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
|
|
|
|
fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
|
|
|
|
fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
|
|
|
|
mincg = cg;
|
|
|
|
minndir = fs->fs_cs(fs, cg).cs_ndir;
|
|
|
|
}
|
|
|
|
for (cg = 0; cg < prefcg; cg++)
|
|
|
|
if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
|
|
|
|
fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
|
|
|
|
fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
|
|
|
|
mincg = cg;
|
|
|
|
minndir = fs->fs_cs(fs, cg).cs_ndir;
|
|
|
|
}
|
|
|
|
return ((ino_t)(fs->fs_ipg * mincg));
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Count various limits which used for
|
|
|
|
* optimal allocation of a directory inode.
|
|
|
|
*/
|
|
|
|
maxndir = min(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
|
2003-10-31 07:25:06 +00:00
|
|
|
minifree = avgifree - avgifree / 4;
|
|
|
|
if (minifree < 1)
|
|
|
|
minifree = 1;
|
|
|
|
minbfree = avgbfree - avgbfree / 4;
|
|
|
|
if (minbfree < 1)
|
|
|
|
minbfree = 1;
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which speed increased, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
cgsize = fs->fs_fsize * fs->fs_fpg;
|
|
|
|
dirsize = fs->fs_avgfilesize * fs->fs_avgfpdir;
|
|
|
|
curdirsize = avgndir ? (cgsize - avgbfree * fs->fs_bsize) / avgndir : 0;
|
|
|
|
if (dirsize < curdirsize)
|
|
|
|
dirsize = curdirsize;
|
2007-09-10 14:12:29 +00:00
|
|
|
if (dirsize <= 0)
|
|
|
|
maxcontigdirs = 0; /* dirsize overflowed */
|
|
|
|
else
|
|
|
|
maxcontigdirs = min((avgbfree * fs->fs_bsize) / dirsize, 255);
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which speed increased, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
if (fs->fs_avgfpdir > 0)
|
|
|
|
maxcontigdirs = min(maxcontigdirs,
|
|
|
|
fs->fs_ipg / fs->fs_avgfpdir);
|
|
|
|
if (maxcontigdirs == 0)
|
|
|
|
maxcontigdirs = 1;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Limit number of dirs in one cg and reserve space for
|
|
|
|
* regular files, but only if we have no deficit in
|
|
|
|
* inodes or space.
|
2013-03-22 21:45:28 +00:00
|
|
|
*
|
|
|
|
* We are trying to find a suitable cylinder group nearby
|
|
|
|
* our preferred cylinder group to place a new directory.
|
|
|
|
* We scan from our preferred cylinder group forward looking
|
|
|
|
* for a cylinder group that meets our criterion. If we get
|
|
|
|
* to the final cylinder group and do not find anything,
|
2014-03-22 11:26:39 +00:00
|
|
|
* we start scanning forwards from the beginning of the
|
|
|
|
* filesystem. While it might seem sensible to start scanning
|
|
|
|
* backwards or even to alternate looking forward and backward,
|
|
|
|
* this approach fails badly when the filesystem is nearly full.
|
|
|
|
* Specifically, we first search all the areas that have no space
|
2016-04-29 20:43:51 +00:00
|
|
|
* and finally try the one preceding that. We repeat this on
|
2014-03-22 11:26:39 +00:00
|
|
|
* every request and in the case of the final block end up
|
|
|
|
* searching the entire filesystem. By jumping to the front
|
|
|
|
* of the filesystem, our future forward searches always look
|
|
|
|
* in new cylinder groups so finds every possible block after
|
|
|
|
* one pass over the filesystem.
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the factor by which speed increased, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
*/
|
|
|
|
prefcg = ino_to_cg(fs, pip->i_number);
|
|
|
|
for (cg = prefcg; cg < fs->fs_ncg; cg++)
|
|
|
|
if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
|
|
|
|
fs->fs_cs(fs, cg).cs_nifree >= minifree &&
|
2014-03-02 02:52:34 +00:00
|
|
|
fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - speed increase as a ratio, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
if (fs->fs_contigdirs[cg] < maxcontigdirs)
|
|
|
|
return ((ino_t)(fs->fs_ipg * cg));
|
|
|
|
}
|
2013-08-28 17:46:32 +00:00
|
|
|
for (cg = 0; cg < prefcg; cg++)
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - speed increase as a ratio, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
|
|
|
|
fs->fs_cs(fs, cg).cs_nifree >= minifree &&
|
2014-03-02 02:52:34 +00:00
|
|
|
fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - speed increase as a ratio, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
if (fs->fs_contigdirs[cg] < maxcontigdirs)
|
|
|
|
return ((ino_t)(fs->fs_ipg * cg));
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - speed increase as a ratio, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then performance
degradation becomes very apparent.
What do I mean by a big file system?
1. A big filesystem is a filesystem which occupies 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref gave me good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speed up a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
/*
|
|
|
|
* This is a backstop when we have deficit in space.
|
|
|
|
*/
|
|
|
|
for (cg = prefcg; cg < fs->fs_ncg; cg++)
|
|
|
|
if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
|
|
|
|
return ((ino_t)(fs->fs_ipg * cg));
|
2013-08-28 17:46:32 +00:00
|
|
|
for (cg = 0; cg < prefcg; cg++)
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems than on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the performance speedup may be. I have done two file/directory
intensive tests on two OpenBSD systems with the old and new dirpref algorithms.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirpref new dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - the speed increase factor, i.e. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder group than its
parent directory, resulting in a directory tree that is spread across
all the cylinder groups. This spreading out results in non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem, but when the filesystem is big the performance
degradation becomes very apparent.
What I mean by a big file system ?
1. A big filesystem is a filesystem which occupy 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref give me a good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweaked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem performance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/copying of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speedup a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
|
|
|
|
break;
|
|
|
|
return ((ino_t)(fs->fs_ipg * cg));
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Select the desired position for the next block in a file. The file is
|
|
|
|
* logically divided into sections. The first section is composed of the
|
2013-07-14 18:44:33 +00:00
|
|
|
* direct blocks and the next fs_maxbpg blocks. Each additional section
|
|
|
|
* contains fs_maxbpg blocks.
|
1995-05-30 08:16:23 +00:00
|
|
|
*
|
1994-05-24 10:09:53 +00:00
|
|
|
* If no blocks have been allocated in the first section, the policy is to
|
|
|
|
* request a block in the same cylinder group as the inode that describes
|
2013-03-22 21:45:28 +00:00
|
|
|
* the file. The first indirect is allocated immediately following the last
|
|
|
|
* direct block and the data blocks for the first indirect immediately
|
|
|
|
* follow it.
|
|
|
|
*
|
|
|
|
* If no blocks have been allocated in any other section, the indirect
|
|
|
|
* block(s) are allocated in the same cylinder group as its inode in an
|
|
|
|
* area reserved immediately following the inode blocks. The policy for
|
|
|
|
* the data blocks is to place them in a cylinder group with a greater than
|
|
|
|
* average number of free blocks. An appropriate cylinder group is found
|
1994-05-24 10:09:53 +00:00
|
|
|
* by using a rotor that sweeps the cylinder groups. When a new group of
|
|
|
|
* blocks is needed, the sweep begins in the cylinder group following the
|
|
|
|
* cylinder group from which the previous allocation was made. The sweep
|
|
|
|
* continues until a cylinder group with greater than the average number
|
|
|
|
* of free blocks is found. If the allocation is for the first block in an
|
2013-07-14 18:44:33 +00:00
|
|
|
* indirect block or the previous block is a hole, then the information on
|
|
|
|
* the previous allocation is unavailable; here a best guess is made based
|
|
|
|
* on the logical block number being allocated.
|
1995-05-30 08:16:23 +00:00
|
|
|
*
|
1994-05-24 10:09:53 +00:00
|
|
|
* If a section is already partially allocated, the policy is to
|
2013-07-14 18:44:33 +00:00
|
|
|
* allocate blocks contiguously within the section if possible.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t
|
|
|
|
ffs_blkpref_ufs1(ip, lbn, indx, bap)
|
1994-05-24 10:09:53 +00:00
|
|
|
struct inode *ip;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs_lbn_t lbn;
|
1994-05-24 10:09:53 +00:00
|
|
|
int indx;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs1_daddr_t *bap;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2002-05-13 09:22:31 +00:00
|
|
|
struct fs *fs;
|
2013-03-22 21:45:28 +00:00
|
|
|
u_int cg, inocg;
|
2010-02-10 20:10:35 +00:00
|
|
|
u_int avgbfree, startcg;
|
2012-11-03 18:55:55 +00:00
|
|
|
ufs2_daddr_t pref;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2013-03-22 21:45:28 +00:00
|
|
|
KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
|
2016-09-17 16:47:34 +00:00
|
|
|
mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
|
|
|
|
fs = ITOFS(ip);
|
2012-11-03 18:55:55 +00:00
|
|
|
/*
|
2013-03-22 21:45:28 +00:00
|
|
|
* Allocation of indirect blocks is indicated by passing negative
|
|
|
|
* values in indx: -1 for single indirect, -2 for double indirect,
|
|
|
|
* -3 for triple indirect. As noted below, we attempt to allocate
|
|
|
|
* the first indirect inline with the file data. For all later
|
|
|
|
* indirect blocks, the data is often allocated in other cylinder
|
|
|
|
* groups. However to speed random file access and to speed up
|
|
|
|
* fsck, the filesystem reserves the first fs_metaspace blocks
|
|
|
|
* (typically half of fs_minfree) of the data area of each cylinder
|
|
|
|
* group to hold these later indirect blocks.
|
|
|
|
*/
|
|
|
|
inocg = ino_to_cg(fs, ip->i_number);
|
|
|
|
if (indx < 0) {
|
|
|
|
/*
|
|
|
|
* Our preference for indirect blocks is the zone at the
|
|
|
|
* beginning of the inode's cylinder group data area that
|
|
|
|
* we try to reserve for indirect blocks.
|
|
|
|
*/
|
|
|
|
pref = cgmeta(fs, inocg);
|
|
|
|
/*
|
|
|
|
* If we are allocating the first indirect block, try to
|
|
|
|
* place it immediately following the last direct block.
|
|
|
|
*/
|
2017-02-15 19:50:26 +00:00
|
|
|
if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) &&
|
|
|
|
ip->i_din1->di_db[UFS_NDADDR - 1] != 0)
|
|
|
|
pref = ip->i_din1->di_db[UFS_NDADDR - 1] + fs->fs_frag;
|
2013-03-22 21:45:28 +00:00
|
|
|
return (pref);
|
|
|
|
}
|
|
|
|
/*
|
2012-11-03 18:55:55 +00:00
|
|
|
* If we are allocating the first data block in the first indirect
|
2013-03-22 21:45:28 +00:00
|
|
|
* block and the indirect has been allocated in the data block area,
|
|
|
|
* try to place it immediately following the indirect block.
|
2012-11-03 18:55:55 +00:00
|
|
|
*/
|
2017-02-15 19:50:26 +00:00
|
|
|
if (lbn == UFS_NDADDR) {
|
2012-11-03 18:55:55 +00:00
|
|
|
pref = ip->i_din1->di_ib[0];
|
2013-03-22 21:45:28 +00:00
|
|
|
if (pref != 0 && pref >= cgdata(fs, inocg) &&
|
|
|
|
pref < cgbase(fs, inocg + 1))
|
2012-11-03 18:55:55 +00:00
|
|
|
return (pref + fs->fs_frag);
|
|
|
|
}
|
2013-03-22 21:45:28 +00:00
|
|
|
/*
|
|
|
|
* If we are at the beginning of a file, or we have already allocated
|
|
|
|
* the maximum number of blocks per cylinder group, or we do not
|
2016-04-29 20:43:51 +00:00
|
|
|
* have a block allocated immediately preceding us, then we need
|
2013-03-22 21:45:28 +00:00
|
|
|
* to decide where to start allocating new blocks.
|
|
|
|
*/
|
1994-05-24 10:09:53 +00:00
|
|
|
if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
|
2013-03-22 21:45:28 +00:00
|
|
|
/*
|
|
|
|
* If we are allocating a directory data block, we want
|
|
|
|
* to place it in the metadata area.
|
|
|
|
*/
|
2018-03-17 12:59:55 +00:00
|
|
|
if ((ip->i_mode & IFMT) == IFDIR)
|
2013-03-22 21:45:28 +00:00
|
|
|
return (cgmeta(fs, inocg));
|
|
|
|
/*
|
|
|
|
* Until we fill all the direct and all the first indirect's
|
|
|
|
* blocks, we try to allocate in the data area of the inode's
|
|
|
|
* cylinder group.
|
|
|
|
*/
|
2017-02-15 19:50:26 +00:00
|
|
|
if (lbn < UFS_NDADDR + NINDIR(fs))
|
2013-03-22 21:45:28 +00:00
|
|
|
return (cgdata(fs, inocg));
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Find a cylinder with greater than average number of
|
|
|
|
* unused data blocks.
|
|
|
|
*/
|
|
|
|
if (indx == 0 || bap[indx - 1] == 0)
|
2013-03-22 21:45:28 +00:00
|
|
|
startcg = inocg + lbn / fs->fs_maxbpg;
|
1994-05-24 10:09:53 +00:00
|
|
|
else
|
|
|
|
startcg = dtog(fs, bap[indx - 1]) + 1;
|
|
|
|
startcg %= fs->fs_ncg;
|
|
|
|
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
|
|
|
|
for (cg = startcg; cg < fs->fs_ncg; cg++)
|
|
|
|
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
|
|
|
|
fs->fs_cgrotor = cg;
|
2013-03-22 21:45:28 +00:00
|
|
|
return (cgdata(fs, cg));
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
for (cg = 0; cg <= startcg; cg++)
|
|
|
|
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
|
|
|
|
fs->fs_cgrotor = cg;
|
2013-03-22 21:45:28 +00:00
|
|
|
return (cgdata(fs, cg));
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
1996-07-12 04:12:25 +00:00
|
|
|
return (0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
/*
|
2013-03-22 21:45:28 +00:00
|
|
|
* Otherwise, we just always try to lay things out contiguously.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
return (bap[indx - 1] + fs->fs_frag);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Same as above, but for UFS2
|
|
|
|
*/
|
|
|
|
ufs2_daddr_t
|
|
|
|
ffs_blkpref_ufs2(ip, lbn, indx, bap)
|
|
|
|
struct inode *ip;
|
|
|
|
ufs_lbn_t lbn;
|
|
|
|
int indx;
|
|
|
|
ufs2_daddr_t *bap;
|
|
|
|
{
|
|
|
|
struct fs *fs;
|
2013-03-22 21:45:28 +00:00
|
|
|
u_int cg, inocg;
|
2010-02-10 20:10:35 +00:00
|
|
|
u_int avgbfree, startcg;
|
2012-11-03 18:55:55 +00:00
|
|
|
ufs2_daddr_t pref;
|
2002-06-21 06:18:05 +00:00
|
|
|
|
2013-03-22 21:45:28 +00:00
|
|
|
KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
|
2016-09-17 16:47:34 +00:00
|
|
|
mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
|
|
|
|
fs = ITOFS(ip);
|
2012-11-03 18:55:55 +00:00
|
|
|
/*
|
2013-03-22 21:45:28 +00:00
|
|
|
* Allocation of indirect blocks is indicated by passing negative
|
|
|
|
* values in indx: -1 for single indirect, -2 for double indirect,
|
|
|
|
* -3 for triple indirect. As noted below, we attempt to allocate
|
|
|
|
* the first indirect inline with the file data. For all later
|
|
|
|
* indirect blocks, the data is often allocated in other cylinder
|
|
|
|
* groups. However to speed random file access and to speed up
|
|
|
|
* fsck, the filesystem reserves the first fs_metaspace blocks
|
|
|
|
* (typically half of fs_minfree) of the data area of each cylinder
|
|
|
|
* group to hold these later indirect blocks.
|
|
|
|
*/
|
|
|
|
inocg = ino_to_cg(fs, ip->i_number);
|
|
|
|
if (indx < 0) {
|
|
|
|
/*
|
|
|
|
* Our preference for indirect blocks is the zone at the
|
|
|
|
* beginning of the inode's cylinder group data area that
|
|
|
|
* we try to reserve for indirect blocks.
|
|
|
|
*/
|
|
|
|
pref = cgmeta(fs, inocg);
|
|
|
|
/*
|
|
|
|
* If we are allocating the first indirect block, try to
|
|
|
|
* place it immediately following the last direct block.
|
|
|
|
*/
|
2017-02-15 19:50:26 +00:00
|
|
|
if (indx == -1 && lbn < UFS_NDADDR + NINDIR(fs) &&
|
|
|
|
ip->i_din2->di_db[UFS_NDADDR - 1] != 0)
|
|
|
|
pref = ip->i_din2->di_db[UFS_NDADDR - 1] + fs->fs_frag;
|
2013-03-22 21:45:28 +00:00
|
|
|
return (pref);
|
|
|
|
}
|
|
|
|
/*
|
2012-11-03 18:55:55 +00:00
|
|
|
* If we are allocating the first data block in the first indirect
|
2013-03-22 21:45:28 +00:00
|
|
|
* block and the indirect has been allocated in the data block area,
|
|
|
|
* try to place it immediately following the indirect block.
|
2012-11-03 18:55:55 +00:00
|
|
|
*/
|
2017-02-15 19:50:26 +00:00
|
|
|
if (lbn == UFS_NDADDR) {
|
2013-03-22 21:45:28 +00:00
|
|
|
pref = ip->i_din2->di_ib[0];
|
|
|
|
if (pref != 0 && pref >= cgdata(fs, inocg) &&
|
|
|
|
pref < cgbase(fs, inocg + 1))
|
2012-11-03 18:55:55 +00:00
|
|
|
return (pref + fs->fs_frag);
|
|
|
|
}
|
2013-03-22 21:45:28 +00:00
|
|
|
/*
|
|
|
|
* If we are at the beginning of a file, or we have already allocated
|
|
|
|
* the maximum number of blocks per cylinder group, or we do not
|
2016-04-29 20:43:51 +00:00
|
|
|
* have a block allocated immediately preceding us, then we need
|
2013-03-22 21:45:28 +00:00
|
|
|
* to decide where to start allocating new blocks.
|
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
|
2013-03-22 21:45:28 +00:00
|
|
|
/*
|
|
|
|
* If we are allocating a directory data block, we want
|
|
|
|
* to place it in the metadata area.
|
|
|
|
*/
|
2018-03-17 12:59:55 +00:00
|
|
|
if ((ip->i_mode & IFMT) == IFDIR)
|
2013-03-22 21:45:28 +00:00
|
|
|
return (cgmeta(fs, inocg));
|
|
|
|
/*
|
|
|
|
* Until we fill all the direct and all the first indirect's
|
|
|
|
* blocks, we try to allocate in the data area of the inode's
|
|
|
|
* cylinder group.
|
|
|
|
*/
|
2017-02-15 19:50:26 +00:00
|
|
|
if (lbn < UFS_NDADDR + NINDIR(fs))
|
2013-03-22 21:45:28 +00:00
|
|
|
return (cgdata(fs, inocg));
|
2002-06-21 06:18:05 +00:00
|
|
|
/*
|
|
|
|
* Find a cylinder with greater than average number of
|
|
|
|
* unused data blocks.
|
|
|
|
*/
|
|
|
|
if (indx == 0 || bap[indx - 1] == 0)
|
2013-03-22 21:45:28 +00:00
|
|
|
startcg = inocg + lbn / fs->fs_maxbpg;
|
2002-06-21 06:18:05 +00:00
|
|
|
else
|
|
|
|
startcg = dtog(fs, bap[indx - 1]) + 1;
|
|
|
|
startcg %= fs->fs_ncg;
|
|
|
|
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
|
|
|
|
for (cg = startcg; cg < fs->fs_ncg; cg++)
|
|
|
|
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
|
|
|
|
fs->fs_cgrotor = cg;
|
2013-03-22 21:45:28 +00:00
|
|
|
return (cgdata(fs, cg));
|
2002-06-21 06:18:05 +00:00
|
|
|
}
|
|
|
|
for (cg = 0; cg <= startcg; cg++)
|
|
|
|
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
|
|
|
|
fs->fs_cgrotor = cg;
|
2013-03-22 21:45:28 +00:00
|
|
|
return (cgdata(fs, cg));
|
2002-06-21 06:18:05 +00:00
|
|
|
}
|
|
|
|
return (0);
|
|
|
|
}
|
1995-09-08 17:16:32 +00:00
|
|
|
/*
|
2013-03-22 21:45:28 +00:00
|
|
|
* Otherwise, we just always try to lay things out contiguously.
|
1995-09-08 17:16:32 +00:00
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
return (bap[indx - 1] + fs->fs_frag);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Implement the cylinder overflow algorithm.
|
|
|
|
*
|
|
|
|
* The policy implemented by this algorithm is:
|
|
|
|
* 1) allocate the block in its requested cylinder group.
|
|
|
|
* 2) quadradically rehash on the cylinder group number.
|
|
|
|
* 3) brute force search for a free block.
|
2005-01-24 10:08:35 +00:00
|
|
|
*
|
|
|
|
* Must be called with the UFS lock held. Will release the lock on success
|
|
|
|
* and return with it held on failure.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
/*VARARGS5*/
|
2002-06-21 06:18:05 +00:00
|
|
|
static ufs2_daddr_t
|
2010-04-24 07:05:35 +00:00
|
|
|
ffs_hashalloc(ip, cg, pref, size, rsize, allocator)
|
1994-05-24 10:09:53 +00:00
|
|
|
struct inode *ip;
|
2010-02-10 20:10:35 +00:00
|
|
|
u_int cg;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t pref;
|
2010-04-24 07:05:35 +00:00
|
|
|
int size; /* Search size for data blocks, mode for inodes */
|
|
|
|
int rsize; /* Real allocated size. */
|
1995-12-03 11:17:15 +00:00
|
|
|
allocfcn_t *allocator;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2002-05-13 09:22:31 +00:00
|
|
|
struct fs *fs;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t result;
|
2010-02-10 20:10:35 +00:00
|
|
|
u_int i, icg = cg;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2016-09-17 16:47:34 +00:00
|
|
|
mtx_assert(UFS_MTX(ITOUMP(ip)), MA_OWNED);
|
2007-11-08 17:21:51 +00:00
|
|
|
#ifdef INVARIANTS
|
2000-07-11 22:07:57 +00:00
|
|
|
if (ITOV(ip)->v_mount->mnt_kern_flag & MNTK_SUSPENDED)
|
|
|
|
panic("ffs_hashalloc: allocation on suspended filesystem");
|
|
|
|
#endif
|
2016-09-17 16:47:34 +00:00
|
|
|
fs = ITOFS(ip);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* 1: preferred cylinder group
|
|
|
|
*/
|
2010-04-24 07:05:35 +00:00
|
|
|
result = (*allocator)(ip, cg, pref, size, rsize);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (result)
|
|
|
|
return (result);
|
|
|
|
/*
|
|
|
|
* 2: quadratic rehash
|
|
|
|
*/
|
|
|
|
for (i = 1; i < fs->fs_ncg; i *= 2) {
|
|
|
|
cg += i;
|
|
|
|
if (cg >= fs->fs_ncg)
|
|
|
|
cg -= fs->fs_ncg;
|
2010-04-24 07:05:35 +00:00
|
|
|
result = (*allocator)(ip, cg, 0, size, rsize);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (result)
|
|
|
|
return (result);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* 3: brute force search
|
|
|
|
* Note that we start at i == 2, since 0 was checked initially,
|
|
|
|
* and 1 is always checked in the quadratic rehash.
|
|
|
|
*/
|
|
|
|
cg = (icg + 2) % fs->fs_ncg;
|
|
|
|
for (i = 2; i < fs->fs_ncg; i++) {
|
2010-04-24 07:05:35 +00:00
|
|
|
result = (*allocator)(ip, cg, 0, size, rsize);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (result)
|
|
|
|
return (result);
|
|
|
|
cg++;
|
|
|
|
if (cg == fs->fs_ncg)
|
|
|
|
cg = 0;
|
|
|
|
}
|
1995-12-03 11:17:15 +00:00
|
|
|
return (0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine whether a fragment can be extended.
|
|
|
|
*
|
1995-05-30 08:16:23 +00:00
|
|
|
* Check to see if the necessary fragments are available, and
|
1994-05-24 10:09:53 +00:00
|
|
|
* if they are, allocate them.
|
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
static ufs2_daddr_t
|
1994-05-24 10:09:53 +00:00
|
|
|
ffs_fragextend(ip, cg, bprev, osize, nsize)
|
|
|
|
struct inode *ip;
|
2010-02-10 20:10:35 +00:00
|
|
|
u_int cg;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t bprev;
|
1994-05-24 10:09:53 +00:00
|
|
|
int osize, nsize;
|
|
|
|
{
|
2002-05-13 09:22:31 +00:00
|
|
|
struct fs *fs;
|
|
|
|
struct cg *cgp;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct buf *bp;
|
2005-01-24 10:08:35 +00:00
|
|
|
struct ufsmount *ump;
|
|
|
|
int nffree;
|
1994-05-24 10:09:53 +00:00
|
|
|
long bno;
|
|
|
|
int frags, bbase;
|
|
|
|
int i, error;
|
2000-03-15 07:08:36 +00:00
|
|
|
u_int8_t *blksfree;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2016-09-17 16:47:34 +00:00
|
|
|
ump = ITOUMP(ip);
|
|
|
|
fs = ump->um_fs;
|
1994-05-24 10:09:53 +00:00
|
|
|
if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
|
1996-07-12 04:12:25 +00:00
|
|
|
return (0);
|
1994-05-24 10:09:53 +00:00
|
|
|
frags = numfrags(fs, nsize);
|
|
|
|
bbase = fragnum(fs, bprev);
|
|
|
|
if (bbase > fragnum(fs, (bprev + frags - 1))) {
|
|
|
|
/* cannot extend across a block boundary */
|
1996-07-12 04:12:25 +00:00
|
|
|
return (0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_UNLOCK(ump);
|
2017-06-28 17:32:09 +00:00
|
|
|
if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0)
|
2005-01-24 10:08:35 +00:00
|
|
|
goto fail;
|
1994-05-24 10:09:53 +00:00
|
|
|
bno = dtogd(fs, bprev);
|
2000-03-15 07:08:36 +00:00
|
|
|
blksfree = cg_blksfree(cgp);
|
1994-05-24 10:09:53 +00:00
|
|
|
for (i = numfrags(fs, osize); i < frags; i++)
|
2005-01-24 10:08:35 +00:00
|
|
|
if (isclr(blksfree, bno + i))
|
|
|
|
goto fail;
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* the current fragment can be extended
|
|
|
|
* deduct the count on fragment being extended into
|
|
|
|
* increase the count on the remaining fragment (if any)
|
|
|
|
* allocate the extended piece
|
|
|
|
*/
|
|
|
|
for (i = frags; i < fs->fs_frag - bbase; i++)
|
2000-03-15 07:08:36 +00:00
|
|
|
if (isclr(blksfree, bno + i))
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
|
|
|
cgp->cg_frsum[i - numfrags(fs, osize)]--;
|
|
|
|
if (i != frags)
|
|
|
|
cgp->cg_frsum[i - frags]++;
|
2005-01-24 10:08:35 +00:00
|
|
|
for (i = numfrags(fs, osize), nffree = 0; i < frags; i++) {
|
2000-03-15 07:08:36 +00:00
|
|
|
clrbit(blksfree, bno + i);
|
1994-05-24 10:09:53 +00:00
|
|
|
cgp->cg_cs.cs_nffree--;
|
2005-01-24 10:08:35 +00:00
|
|
|
nffree++;
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
|
|
|
fs->fs_cstotal.cs_nffree -= nffree;
|
|
|
|
fs->fs_cs(fs, cg).cs_nffree -= nffree;
|
1994-05-24 10:09:53 +00:00
|
|
|
fs->fs_fmod = 1;
|
2005-01-24 10:08:35 +00:00
|
|
|
ACTIVECLEAR(fs, cg);
|
|
|
|
UFS_UNLOCK(ump);
|
1998-03-08 09:59:44 +00:00
|
|
|
if (DOINGSOFTDEP(ITOV(ip)))
|
2010-04-24 07:05:35 +00:00
|
|
|
softdep_setup_blkmapdep(bp, UFSTOVFS(ump), bprev,
|
|
|
|
frags, numfrags(fs, osize));
|
1994-05-24 10:09:53 +00:00
|
|
|
bdwrite(bp);
|
|
|
|
return (bprev);
|
2005-01-24 10:08:35 +00:00
|
|
|
|
|
|
|
fail:
|
|
|
|
brelse(bp);
|
|
|
|
UFS_LOCK(ump);
|
|
|
|
return (0);
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine whether a block can be allocated.
|
|
|
|
*
|
|
|
|
* Check to see if a block of the appropriate size is available,
|
|
|
|
* and if it is, allocate it.
|
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
static ufs2_daddr_t
|
2010-04-24 07:05:35 +00:00
|
|
|
ffs_alloccg(ip, cg, bpref, size, rsize)
|
1994-05-24 10:09:53 +00:00
|
|
|
struct inode *ip;
|
2010-02-10 20:10:35 +00:00
|
|
|
u_int cg;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t bpref;
|
1994-05-24 10:09:53 +00:00
|
|
|
int size;
|
2010-04-24 07:05:35 +00:00
|
|
|
int rsize;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2002-05-13 09:22:31 +00:00
|
|
|
struct fs *fs;
|
|
|
|
struct cg *cgp;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct buf *bp;
|
2005-01-24 10:08:35 +00:00
|
|
|
struct ufsmount *ump;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs1_daddr_t bno;
|
|
|
|
ufs2_daddr_t blkno;
|
|
|
|
int i, allocsiz, error, frags;
|
2000-03-15 07:08:36 +00:00
|
|
|
u_int8_t *blksfree;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2016-09-17 16:47:34 +00:00
|
|
|
ump = ITOUMP(ip);
|
|
|
|
fs = ump->um_fs;
|
1994-05-24 10:09:53 +00:00
|
|
|
if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
|
1996-07-12 04:12:25 +00:00
|
|
|
return (0);
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_UNLOCK(ump);
|
2017-06-28 17:32:09 +00:00
|
|
|
if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0 ||
|
|
|
|
(cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
|
2005-01-24 10:08:35 +00:00
|
|
|
goto fail;
|
1994-05-24 10:09:53 +00:00
|
|
|
if (size == fs->fs_bsize) {
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
2010-04-24 07:05:35 +00:00
|
|
|
blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
|
2005-01-24 10:08:35 +00:00
|
|
|
ACTIVECLEAR(fs, cg);
|
|
|
|
UFS_UNLOCK(ump);
|
1994-05-24 10:09:53 +00:00
|
|
|
bdwrite(bp);
|
2002-06-21 06:18:05 +00:00
|
|
|
return (blkno);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
/*
|
|
|
|
* check to see if any fragments are already available
|
|
|
|
* allocsiz is the size which will be allocated, hacking
|
|
|
|
* it down to a smaller size if necessary
|
|
|
|
*/
|
2000-03-15 07:08:36 +00:00
|
|
|
blksfree = cg_blksfree(cgp);
|
1994-05-24 10:09:53 +00:00
|
|
|
frags = numfrags(fs, size);
|
|
|
|
for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
|
|
|
|
if (cgp->cg_frsum[allocsiz] != 0)
|
|
|
|
break;
|
|
|
|
if (allocsiz == fs->fs_frag) {
|
|
|
|
/*
|
1995-05-30 08:16:23 +00:00
|
|
|
* no fragments were available, so a block will be
|
1994-05-24 10:09:53 +00:00
|
|
|
* allocated, and hacked up
|
|
|
|
*/
|
2005-01-24 10:08:35 +00:00
|
|
|
if (cgp->cg_cs.cs_nbfree == 0)
|
|
|
|
goto fail;
|
|
|
|
UFS_LOCK(ump);
|
2010-04-24 07:05:35 +00:00
|
|
|
blkno = ffs_alloccgblk(ip, bp, bpref, rsize);
|
2005-01-24 10:08:35 +00:00
|
|
|
ACTIVECLEAR(fs, cg);
|
|
|
|
UFS_UNLOCK(ump);
|
1994-05-24 10:09:53 +00:00
|
|
|
bdwrite(bp);
|
2002-06-21 06:18:05 +00:00
|
|
|
return (blkno);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2010-04-24 07:05:35 +00:00
|
|
|
KASSERT(size == rsize,
|
|
|
|
("ffs_alloccg: size(%d) != rsize(%d)", size, rsize));
|
1994-05-24 10:09:53 +00:00
|
|
|
bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
|
2005-01-24 10:08:35 +00:00
|
|
|
if (bno < 0)
|
|
|
|
goto fail;
|
1994-05-24 10:09:53 +00:00
|
|
|
for (i = 0; i < frags; i++)
|
2000-03-15 07:08:36 +00:00
|
|
|
clrbit(blksfree, bno + i);
|
1994-05-24 10:09:53 +00:00
|
|
|
cgp->cg_cs.cs_nffree -= frags;
|
|
|
|
cgp->cg_frsum[allocsiz]--;
|
|
|
|
if (frags != allocsiz)
|
|
|
|
cgp->cg_frsum[allocsiz - frags]++;
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
|
|
|
fs->fs_cstotal.cs_nffree -= frags;
|
|
|
|
fs->fs_cs(fs, cg).cs_nffree -= frags;
|
|
|
|
fs->fs_fmod = 1;
|
2004-12-09 21:24:00 +00:00
|
|
|
blkno = cgbase(fs, cg) + bno;
|
2005-01-24 10:08:35 +00:00
|
|
|
ACTIVECLEAR(fs, cg);
|
|
|
|
UFS_UNLOCK(ump);
|
1998-03-08 09:59:44 +00:00
|
|
|
if (DOINGSOFTDEP(ITOV(ip)))
|
2010-04-24 07:05:35 +00:00
|
|
|
softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno, frags, 0);
|
1994-05-24 10:09:53 +00:00
|
|
|
bdwrite(bp);
|
2002-06-21 06:18:05 +00:00
|
|
|
return (blkno);
|
2005-01-24 10:08:35 +00:00
|
|
|
|
|
|
|
fail:
|
|
|
|
brelse(bp);
|
|
|
|
UFS_LOCK(ump);
|
|
|
|
return (0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Allocate a block in a cylinder group.
|
|
|
|
*
|
|
|
|
* This algorithm implements the following policy:
|
|
|
|
* 1) allocate the requested block.
|
|
|
|
* 2) allocate a rotationally optimal block in the same cylinder.
|
|
|
|
* 3) allocate the next available block on the block rotor for the
|
|
|
|
* specified cylinder group.
|
|
|
|
* Note that this routine only allocates fs_bsize blocks; these
|
|
|
|
* blocks may be fragmented by the routine that allocates them.
|
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
static ufs2_daddr_t
|
2010-04-24 07:05:35 +00:00
|
|
|
ffs_alloccgblk(ip, bp, bpref, size)
|
1998-03-08 09:59:44 +00:00
|
|
|
struct inode *ip;
|
|
|
|
struct buf *bp;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t bpref;
|
2010-04-24 07:05:35 +00:00
|
|
|
int size;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
1998-03-08 09:59:44 +00:00
|
|
|
struct fs *fs;
|
|
|
|
struct cg *cgp;
|
2005-01-24 10:08:35 +00:00
|
|
|
struct ufsmount *ump;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs1_daddr_t bno;
|
|
|
|
ufs2_daddr_t blkno;
|
2000-03-15 07:08:36 +00:00
|
|
|
u_int8_t *blksfree;
|
2013-03-22 21:45:28 +00:00
|
|
|
int i, cgbpref;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2016-09-17 16:47:34 +00:00
|
|
|
ump = ITOUMP(ip);
|
|
|
|
fs = ump->um_fs;
|
2005-01-24 10:08:35 +00:00
|
|
|
mtx_assert(UFS_MTX(ump), MA_OWNED);
|
1998-03-08 09:59:44 +00:00
|
|
|
cgp = (struct cg *)bp->b_data;
|
2000-03-15 07:08:36 +00:00
|
|
|
blksfree = cg_blksfree(cgp);
|
2013-03-22 21:45:28 +00:00
|
|
|
if (bpref == 0) {
|
2013-07-02 21:07:08 +00:00
|
|
|
bpref = cgbase(fs, cgp->cg_cgx) + cgp->cg_rotor + fs->fs_frag;
|
2013-03-22 21:45:28 +00:00
|
|
|
} else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) {
|
|
|
|
/* map bpref to correct zone in this cg */
|
|
|
|
if (bpref < cgdata(fs, cgbpref))
|
|
|
|
bpref = cgmeta(fs, cgp->cg_cgx);
|
|
|
|
else
|
|
|
|
bpref = cgdata(fs, cgp->cg_cgx);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2013-03-22 21:45:28 +00:00
|
|
|
/*
|
|
|
|
* if the requested block is available, use it
|
|
|
|
*/
|
|
|
|
bno = dtogd(fs, blknum(fs, bpref));
|
|
|
|
if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
|
|
|
|
goto gotit;
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2002-06-21 06:18:05 +00:00
|
|
|
* Take the next available block in this cylinder group.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
|
|
|
|
if (bno < 0)
|
1996-07-12 04:12:25 +00:00
|
|
|
return (0);
|
2013-03-22 21:45:28 +00:00
|
|
|
/* Update cg_rotor only if allocated from the data zone */
|
|
|
|
if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx)))
|
|
|
|
cgp->cg_rotor = bno;
|
1994-05-24 10:09:53 +00:00
|
|
|
gotit:
|
|
|
|
blkno = fragstoblks(fs, bno);
|
2000-03-15 07:08:36 +00:00
|
|
|
ffs_clrblock(fs, blksfree, (long)blkno);
|
2010-04-24 07:05:35 +00:00
|
|
|
ffs_clusteracct(fs, cgp, blkno, -1);
|
1994-05-24 10:09:53 +00:00
|
|
|
cgp->cg_cs.cs_nbfree--;
|
|
|
|
fs->fs_cstotal.cs_nbfree--;
|
|
|
|
fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
|
|
|
|
fs->fs_fmod = 1;
|
2004-12-09 21:24:00 +00:00
|
|
|
blkno = cgbase(fs, cgp->cg_cgx) + bno;
|
2010-04-24 07:05:35 +00:00
|
|
|
/*
|
|
|
|
* If the caller didn't want the whole block free the frags here.
|
|
|
|
*/
|
|
|
|
size = numfrags(fs, size);
|
|
|
|
if (size != fs->fs_frag) {
|
|
|
|
bno = dtogd(fs, blkno);
|
|
|
|
for (i = size; i < fs->fs_frag; i++)
|
|
|
|
setbit(blksfree, bno + i);
|
|
|
|
i = fs->fs_frag - size;
|
|
|
|
cgp->cg_cs.cs_nffree += i;
|
|
|
|
fs->fs_cstotal.cs_nffree += i;
|
|
|
|
fs->fs_cs(fs, cgp->cg_cgx).cs_nffree += i;
|
|
|
|
fs->fs_fmod = 1;
|
|
|
|
cgp->cg_frsum[i]++;
|
|
|
|
}
|
2005-01-24 10:08:35 +00:00
|
|
|
/* XXX Fixme. */
|
|
|
|
UFS_UNLOCK(ump);
|
1998-03-08 09:59:44 +00:00
|
|
|
if (DOINGSOFTDEP(ITOV(ip)))
|
2010-04-24 07:05:35 +00:00
|
|
|
softdep_setup_blkmapdep(bp, UFSTOVFS(ump), blkno,
|
|
|
|
size, 0);
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
1998-03-08 09:59:44 +00:00
|
|
|
return (blkno);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine whether a cluster can be allocated.
|
|
|
|
*
|
|
|
|
* We do not currently check for optimal rotational layout if there
|
|
|
|
* are multiple choices in the same cylinder group. Instead we just
|
|
|
|
* take the first one that we find following bpref.
|
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
static ufs2_daddr_t
|
2015-04-24 23:27:50 +00:00
|
|
|
ffs_clusteralloc(ip, cg, bpref, len)
|
1994-05-24 10:09:53 +00:00
|
|
|
struct inode *ip;
|
2010-02-10 20:10:35 +00:00
|
|
|
u_int cg;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t bpref;
|
1994-05-24 10:09:53 +00:00
|
|
|
int len;
|
|
|
|
{
|
2002-05-13 09:22:31 +00:00
|
|
|
struct fs *fs;
|
|
|
|
struct cg *cgp;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct buf *bp;
|
2005-01-24 10:08:35 +00:00
|
|
|
struct ufsmount *ump;
|
2017-06-28 17:32:09 +00:00
|
|
|
int i, run, bit, map, got, error;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t bno;
|
1994-05-24 10:09:53 +00:00
|
|
|
u_char *mapp;
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t *lp;
|
2000-03-15 07:08:36 +00:00
|
|
|
u_int8_t *blksfree;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2016-09-17 16:47:34 +00:00
|
|
|
ump = ITOUMP(ip);
|
|
|
|
fs = ump->um_fs;
|
1997-02-10 02:22:35 +00:00
|
|
|
if (fs->fs_maxcluster[cg] < len)
|
1999-12-21 11:14:12 +00:00
|
|
|
return (0);
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_UNLOCK(ump);
|
2017-06-28 17:32:09 +00:00
|
|
|
if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0) {
|
|
|
|
UFS_LOCK(ump);
|
|
|
|
return (0);
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Check to see if a cluster of the needed size (or bigger) is
|
|
|
|
* available in this cylinder group.
|
|
|
|
*/
|
1997-02-10 02:22:35 +00:00
|
|
|
lp = &cg_clustersum(cgp)[len];
|
1994-05-24 10:09:53 +00:00
|
|
|
for (i = len; i <= fs->fs_contigsumsize; i++)
|
1997-02-10 02:22:35 +00:00
|
|
|
if (*lp++ > 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
break;
|
1997-02-10 02:22:35 +00:00
|
|
|
if (i > fs->fs_contigsumsize) {
|
|
|
|
/*
|
|
|
|
* This is the first time looking for a cluster in this
|
|
|
|
* cylinder group. Update the cluster summary information
|
|
|
|
* to reflect the true maximum sized cluster so that
|
|
|
|
* future cluster allocation requests can avoid reading
|
|
|
|
* the cylinder group map only to find no clusters.
|
|
|
|
*/
|
|
|
|
lp = &cg_clustersum(cgp)[len - 1];
|
|
|
|
for (i = len - 1; i > 0; i--)
|
|
|
|
if (*lp-- > 0)
|
|
|
|
break;
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
1997-02-10 02:22:35 +00:00
|
|
|
fs->fs_maxcluster[cg] = i;
|
2017-06-28 17:32:09 +00:00
|
|
|
brelse(bp);
|
|
|
|
return (0);
|
1997-02-10 02:22:35 +00:00
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Search the cluster map to find a big enough cluster.
|
|
|
|
* We take the first one that we find, even if it is larger
|
|
|
|
* than we need as we prefer to get one close to the previous
|
|
|
|
* block allocation. We do not search before the current
|
|
|
|
* preference point as we do not want to allocate a block
|
|
|
|
* that is allocated before the previous one (as we will
|
|
|
|
* then have to wait for another pass of the elevator
|
|
|
|
* algorithm before it will be read). We prefer to fail and
|
|
|
|
* be recalled to try an allocation in the next cylinder group.
|
|
|
|
*/
|
|
|
|
if (dtog(fs, bpref) != cg)
|
2013-03-22 21:45:28 +00:00
|
|
|
bpref = cgdata(fs, cg);
|
1994-05-24 10:09:53 +00:00
|
|
|
else
|
2013-03-22 21:45:28 +00:00
|
|
|
bpref = blknum(fs, bpref);
|
|
|
|
bpref = fragstoblks(fs, dtogd(fs, bpref));
|
1994-05-24 10:09:53 +00:00
|
|
|
mapp = &cg_clustersfree(cgp)[bpref / NBBY];
|
|
|
|
map = *mapp++;
|
|
|
|
bit = 1 << (bpref % NBBY);
|
1997-02-10 02:22:35 +00:00
|
|
|
for (run = 0, got = bpref; got < cgp->cg_nclusterblks; got++) {
|
1994-05-24 10:09:53 +00:00
|
|
|
if ((map & bit) == 0) {
|
|
|
|
run = 0;
|
|
|
|
} else {
|
|
|
|
run++;
|
|
|
|
if (run == len)
|
|
|
|
break;
|
|
|
|
}
|
1997-02-10 02:22:35 +00:00
|
|
|
if ((got & (NBBY - 1)) != (NBBY - 1)) {
|
1994-05-24 10:09:53 +00:00
|
|
|
bit <<= 1;
|
|
|
|
} else {
|
|
|
|
map = *mapp++;
|
|
|
|
bit = 1;
|
|
|
|
}
|
|
|
|
}
|
2017-06-28 17:32:09 +00:00
|
|
|
if (got >= cgp->cg_nclusterblks) {
|
|
|
|
UFS_LOCK(ump);
|
|
|
|
brelse(bp);
|
|
|
|
return (0);
|
|
|
|
}
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Allocate the cluster that we have found.
|
|
|
|
*/
|
2000-03-15 07:08:36 +00:00
|
|
|
blksfree = cg_blksfree(cgp);
|
1997-02-10 02:22:35 +00:00
|
|
|
for (i = 1; i <= len; i++)
|
2000-03-15 07:08:36 +00:00
|
|
|
if (!ffs_isblock(fs, blksfree, got - run + i))
|
1997-02-10 02:22:35 +00:00
|
|
|
panic("ffs_clusteralloc: map mismatch");
|
2004-12-09 21:24:00 +00:00
|
|
|
bno = cgbase(fs, cg) + blkstofrags(fs, got - run + 1);
|
1997-02-10 02:22:35 +00:00
|
|
|
if (dtog(fs, bno) != cg)
|
|
|
|
panic("ffs_clusteralloc: allocated out of group");
|
1994-05-24 10:09:53 +00:00
|
|
|
len = blkstofrags(fs, len);
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
1994-05-24 10:09:53 +00:00
|
|
|
for (i = 0; i < len; i += fs->fs_frag)
|
2010-04-24 07:05:35 +00:00
|
|
|
if (ffs_alloccgblk(ip, bp, bno + i, fs->fs_bsize) != bno + i)
|
1994-05-24 10:09:53 +00:00
|
|
|
panic("ffs_clusteralloc: lost block");
|
2005-01-24 10:08:35 +00:00
|
|
|
ACTIVECLEAR(fs, cg);
|
|
|
|
UFS_UNLOCK(ump);
|
1995-08-07 08:16:32 +00:00
|
|
|
bdwrite(bp);
|
1994-05-24 10:09:53 +00:00
|
|
|
return (bno);
|
|
|
|
}
|
|
|
|
|
An inode block must not be blockingly read while cg block is owned.
The order is inode buffer lock -> snaplk -> cg buffer lock, reversing
the order causes deadlocks.
Inode block must not be written while cg block buffer is owned. The
FFS copy on write needs to allocate a block to copy the content of the
inode block, and the cylinder group selected for the allocation might
be the same as the owned cg block. The reserved block detection code
in the ffs_copyonwrite() and ffs_bp_snapblk() is unable to detect the
situation, because the locked cg buffer is not exposed to it.
In order to maintain the dependency between initialized inode block
and the cg_initediblk pointer, look up the inode buffer in
non-blocking mode. If succeeded, brelse cg block, initialize the inode
block and write it. After the write is finished, reread cg block and
update the cg_initediblk.
If inode block is already locked by another thread, let the other
thread initialize it. If another thread raced with us after we
started writing inode block, the situation is detected by an update of
cg_initediblk. Note that double-initialization of the inode block is
harmless, the block cannot be used until cg_initediblk is incremented.
Sponsored by: The FreeBSD Foundation
In collaboration with: pho
Reviewed by: mckusick
MFC after: 1 month
X-MFC-note: after r246877
2013-02-27 07:31:23 +00:00
|
|
|
static inline struct buf *
|
|
|
|
getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags)
|
|
|
|
{
|
|
|
|
struct fs *fs;
|
|
|
|
|
2016-09-17 16:47:34 +00:00
|
|
|
fs = ITOFS(ip);
|
|
|
|
return (getblk(ITODEVVP(ip), fsbtodb(fs, ino_to_fsba(fs,
|
An inode block must not be blockingly read while cg block is owned.
The order is inode buffer lock -> snaplk -> cg buffer lock, reversing
the order causes deadlocks.
Inode block must not be written while cg block buffer is owned. The
FFS copy on write needs to allocate a block to copy the content of the
inode block, and the cylinder group selected for the allocation might
be the same as the owned cg block. The reserved block detection code
in the ffs_copyonwrite() and ffs_bp_snapblk() is unable to detect the
situation, because the locked cg buffer is not exposed to it.
In order to maintain the dependency between initialized inode block
and the cg_initediblk pointer, look up the inode buffer in
non-blocking mode. If succeeded, brelse cg block, initialize the inode
block and write it. After the write is finished, reread cg block and
update the cg_initediblk.
If inode block is already locked by another thread, let the another
thread initialize it. If another thread raced with us after we
started writing inode block, the situation is detected by an update of
cg_initediblk. Note that double-initialization of the inode block is
harmless, the block cannot be used until cg_initediblk is incremented.
Sponsored by: The FreeBSD Foundation
In collaboration with: pho
Reviewed by: mckusick
MFC after: 1 month
X-MFC-note: after r246877
2013-02-27 07:31:23 +00:00
|
|
|
cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0,
|
|
|
|
gbflags));
|
|
|
|
}
|
|
|
|
|
2017-12-09 15:44:30 +00:00
|
|
|
/*
|
|
|
|
* Synchronous inode initialization is needed only when barrier writes do not
|
|
|
|
* work as advertised, and will impose a heavy cost on file creation in a newly
|
|
|
|
* created filesystem.
|
|
|
|
*/
|
|
|
|
static int doasyncinodeinit = 1;
|
|
|
|
SYSCTL_INT(_vfs_ffs, OID_AUTO, doasyncinodeinit, CTLFLAG_RWTUN,
|
|
|
|
&doasyncinodeinit, 0,
|
|
|
|
"Perform inode block initialization using asynchronous writes");
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Determine whether an inode can be allocated.
|
|
|
|
*
|
|
|
|
* Check to see if an inode is available, and if it is,
|
|
|
|
* allocate it using the following policy:
|
|
|
|
* 1) allocate the requested inode.
|
|
|
|
* 2) allocate the next available inode after the requested
|
|
|
|
* inode in the specified cylinder group.
|
|
|
|
*/
|
2002-06-22 21:24:58 +00:00
|
|
|
static ufs2_daddr_t
|
2010-04-24 07:05:35 +00:00
|
|
|
ffs_nodealloccg(ip, cg, ipref, mode, unused)
|
1994-05-24 10:09:53 +00:00
|
|
|
struct inode *ip;
|
2010-02-10 20:10:35 +00:00
|
|
|
u_int cg;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t ipref;
|
1994-05-24 10:09:53 +00:00
|
|
|
int mode;
|
2010-04-24 07:05:35 +00:00
|
|
|
int unused;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2002-05-13 09:22:31 +00:00
|
|
|
struct fs *fs;
|
|
|
|
struct cg *cgp;
|
2002-06-21 06:18:05 +00:00
|
|
|
struct buf *bp, *ibp;
|
2005-01-24 10:08:35 +00:00
|
|
|
struct ufsmount *ump;
|
2012-01-01 20:47:33 +00:00
|
|
|
u_int8_t *inosused, *loc;
|
2002-06-21 06:18:05 +00:00
|
|
|
struct ufs2_dinode *dp2;
|
2012-01-01 20:47:33 +00:00
|
|
|
int error, start, len, i;
|
An inode block must not be blockingly read while cg block is owned.
The order is inode buffer lock -> snaplk -> cg buffer lock, reversing
the order causes deadlocks.
Inode block must not be written while cg block buffer is owned. The
FFS copy on write needs to allocate a block to copy the content of the
inode block, and the cylinder group selected for the allocation might
be the same as the owned cg block. The reserved block detection code
in the ffs_copyonwrite() and ffs_bp_snapblk() is unable to detect the
situation, because the locked cg buffer is not exposed to it.
In order to maintain the dependency between initialized inode block
and the cg_initediblk pointer, look up the inode buffer in
non-blocking mode. If succeeded, brelse cg block, initialize the inode
block and write it. After the write is finished, reread cg block and
update the cg_initediblk.
If inode block is already locked by another thread, let the another
thread initialize it. If another thread raced with us after we
started writing inode block, the situation is detected by an update of
cg_initediblk. Note that double-initialization of the inode block is
harmless, the block cannot be used until cg_initediblk is incremented.
Sponsored by: The FreeBSD Foundation
In collaboration with: pho
Reviewed by: mckusick
MFC after: 1 month
X-MFC-note: after r246877
2013-02-27 07:31:23 +00:00
|
|
|
u_int32_t old_initediblk;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2016-09-17 16:47:34 +00:00
|
|
|
ump = ITOUMP(ip);
|
|
|
|
fs = ump->um_fs;
|
An inode block must not be blockingly read while cg block is owned.
The order is inode buffer lock -> snaplk -> cg buffer lock, reversing
the order causes deadlocks.
Inode block must not be written while cg block buffer is owned. The
FFS copy on write needs to allocate a block to copy the content of the
inode block, and the cylinder group selected for the allocation might
be the same as the owned cg block. The reserved block detection code
in the ffs_copyonwrite() and ffs_bp_snapblk() is unable to detect the
situation, because the locked cg buffer is not exposed to it.
In order to maintain the dependency between initialized inode block
and the cg_initediblk pointer, look up the inode buffer in
non-blocking mode. If succeeded, brelse cg block, initialize the inode
block and write it. After the write is finished, reread cg block and
update the cg_initediblk.
If inode block is already locked by another thread, let the another
thread initialize it. If another thread raced with us after we
started writing inode block, the situation is detected by an update of
cg_initediblk. Note that double-initialization of the inode block is
harmless, the block cannot be used until cg_initediblk is incremented.
Sponsored by: The FreeBSD Foundation
In collaboration with: pho
Reviewed by: mckusick
MFC after: 1 month
X-MFC-note: after r246877
2013-02-27 07:31:23 +00:00
|
|
|
check_nifree:
|
1994-05-24 10:09:53 +00:00
|
|
|
if (fs->fs_cs(fs, cg).cs_nifree == 0)
|
1996-07-12 04:12:25 +00:00
|
|
|
return (0);
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_UNLOCK(ump);
|
2017-06-28 17:32:09 +00:00
|
|
|
if ((error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp)) != 0) {
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
1996-07-12 04:12:25 +00:00
|
|
|
return (0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
An inode block must not be blockingly read while cg block is owned.
The order is inode buffer lock -> snaplk -> cg buffer lock, reversing
the order causes deadlocks.
Inode block must not be written while cg block buffer is owned. The
FFS copy on write needs to allocate a block to copy the content of the
inode block, and the cylinder group selected for the allocation might
be the same as the owned cg block. The reserved block detection code
in the ffs_copyonwrite() and ffs_bp_snapblk() is unable to detect the
situation, because the locked cg buffer is not exposed to it.
In order to maintain the dependency between initialized inode block
and the cg_initediblk pointer, look up the inode buffer in
non-blocking mode. If succeeded, brelse cg block, initialize the inode
block and write it. After the write is finished, reread cg block and
update the cg_initediblk.
If inode block is already locked by another thread, let the another
thread initialize it. If another thread raced with us after we
started writing inode block, the situation is detected by an update of
cg_initediblk. Note that double-initialization of the inode block is
harmless, the block cannot be used until cg_initediblk is incremented.
Sponsored by: The FreeBSD Foundation
In collaboration with: pho
Reviewed by: mckusick
MFC after: 1 month
X-MFC-note: after r246877
2013-02-27 07:31:23 +00:00
|
|
|
restart:
|
2017-06-28 17:32:09 +00:00
|
|
|
if (cgp->cg_cs.cs_nifree == 0) {
|
1994-05-24 10:09:53 +00:00
|
|
|
brelse(bp);
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
1996-07-12 04:12:25 +00:00
|
|
|
return (0);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2000-03-15 07:08:36 +00:00
|
|
|
inosused = cg_inosused(cgp);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (ipref) {
|
|
|
|
ipref %= fs->fs_ipg;
|
2000-03-15 07:08:36 +00:00
|
|
|
if (isclr(inosused, ipref))
|
1994-05-24 10:09:53 +00:00
|
|
|
goto gotit;
|
|
|
|
}
|
|
|
|
start = cgp->cg_irotor / NBBY;
|
|
|
|
len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
|
2012-01-01 20:47:33 +00:00
|
|
|
loc = memcchr(&inosused[start], 0xff, len);
|
|
|
|
if (loc == NULL) {
|
1994-05-24 10:09:53 +00:00
|
|
|
len = start + 1;
|
|
|
|
start = 0;
|
2012-01-01 20:47:33 +00:00
|
|
|
loc = memcchr(&inosused[start], 0xff, len);
|
|
|
|
if (loc == NULL) {
|
1995-02-14 06:14:28 +00:00
|
|
|
printf("cg = %d, irotor = %ld, fs = %s\n",
|
1998-07-11 07:46:16 +00:00
|
|
|
cg, (long)cgp->cg_irotor, fs->fs_fsmnt);
|
1994-05-24 10:09:53 +00:00
|
|
|
panic("ffs_nodealloccg: map corrupted");
|
|
|
|
/* NOTREACHED */
|
|
|
|
}
|
|
|
|
}
|
2012-01-01 20:47:33 +00:00
|
|
|
ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1;
|
1994-05-24 10:09:53 +00:00
|
|
|
gotit:
|
2002-06-21 06:18:05 +00:00
|
|
|
/*
|
|
|
|
* Check to see if we need to initialize more inodes.
|
|
|
|
*/
|
|
|
|
if (fs->fs_magic == FS_UFS2_MAGIC &&
|
|
|
|
ipref + INOPB(fs) > cgp->cg_initediblk &&
|
|
|
|
cgp->cg_initediblk < cgp->cg_niblk) {
|
An inode block must not be blockingly read while cg block is owned.
The order is inode buffer lock -> snaplk -> cg buffer lock, reversing
the order causes deadlocks.
Inode block must not be written while cg block buffer is owned. The
FFS copy on write needs to allocate a block to copy the content of the
inode block, and the cylinder group selected for the allocation might
be the same as the owned cg block. The reserved block detection code
in the ffs_copyonwrite() and ffs_bp_snapblk() is unable to detect the
situation, because the locked cg buffer is not exposed to it.
In order to maintain the dependency between initialized inode block
and the cg_initediblk pointer, look up the inode buffer in
non-blocking mode. If succeeded, brelse cg block, initialize the inode
block and write it. After the write is finished, reread cg block and
update the cg_initediblk.
If inode block is already locked by another thread, let the another
thread initialize it. If another thread raced with us after we
started writing inode block, the situation is detected by an update of
cg_initediblk. Note that double-initialization of the inode block is
harmless, the block cannot be used until cg_initediblk is incremented.
Sponsored by: The FreeBSD Foundation
In collaboration with: pho
Reviewed by: mckusick
MFC after: 1 month
X-MFC-note: after r246877
2013-02-27 07:31:23 +00:00
|
|
|
old_initediblk = cgp->cg_initediblk;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free the cylinder group lock before writing the
|
|
|
|
* initialized inode block. Entering the
|
|
|
|
* babarrierwrite() with the cylinder group lock
|
|
|
|
* causes lock order violation between the lock and
|
|
|
|
* snaplk.
|
|
|
|
*
|
|
|
|
* Another thread can decide to initialize the same
|
|
|
|
* inode block, but whichever thread first gets the
|
|
|
|
* cylinder group lock after writing the newly
|
|
|
|
* allocated inode block will update it and the other
|
|
|
|
* will realize that it has lost and leave the
|
|
|
|
* cylinder group unchanged.
|
|
|
|
*/
|
|
|
|
ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT);
|
|
|
|
brelse(bp);
|
|
|
|
if (ibp == NULL) {
|
|
|
|
/*
|
|
|
|
* The inode block buffer is already owned by
|
|
|
|
* another thread, which must initialize it.
|
|
|
|
* Wait on the buffer to allow another thread
|
|
|
|
* to finish the updates, with dropped cg
|
|
|
|
* buffer lock, then retry.
|
|
|
|
*/
|
|
|
|
ibp = getinobuf(ip, cg, old_initediblk, 0);
|
|
|
|
brelse(ibp);
|
|
|
|
UFS_LOCK(ump);
|
|
|
|
goto check_nifree;
|
|
|
|
}
|
2002-06-21 06:18:05 +00:00
|
|
|
bzero(ibp->b_data, (int)fs->fs_bsize);
|
|
|
|
dp2 = (struct ufs2_dinode *)(ibp->b_data);
|
|
|
|
for (i = 0; i < INOPB(fs); i++) {
|
2016-05-22 14:31:20 +00:00
|
|
|
while (dp2->di_gen == 0)
|
|
|
|
dp2->di_gen = arc4random();
|
2002-06-21 06:18:05 +00:00
|
|
|
dp2++;
|
|
|
|
}
|
2017-12-09 15:44:30 +00:00
|
|
|
|
2013-02-16 15:11:40 +00:00
|
|
|
/*
|
|
|
|
* Rather than adding a soft updates dependency to ensure
|
|
|
|
* that the new inode block is written before it is claimed
|
|
|
|
* by the cylinder group map, we just do a barrier write
|
|
|
|
* here. The barrier write will ensure that the inode block
|
|
|
|
* gets written before the updated cylinder group map can be
|
|
|
|
* written. The barrier write should only slow down bulk
|
|
|
|
* loading of newly created filesystems.
|
|
|
|
*/
|
2017-12-09 15:44:30 +00:00
|
|
|
if (doasyncinodeinit)
|
|
|
|
babarrierwrite(ibp);
|
|
|
|
else
|
|
|
|
bwrite(ibp);
|
An inode block must not be blockingly read while cg block is owned.
The order is inode buffer lock -> snaplk -> cg buffer lock, reversing
the order causes deadlocks.
Inode block must not be written while cg block buffer is owned. The
FFS copy on write needs to allocate a block to copy the content of the
inode block, and the cylinder group selected for the allocation might
be the same as the owned cg block. The reserved block detection code
in the ffs_copyonwrite() and ffs_bp_snapblk() is unable to detect the
situation, because the locked cg buffer is not exposed to it.
In order to maintain the dependency between initialized inode block
and the cg_initediblk pointer, look up the inode buffer in
non-blocking mode. If succeeded, brelse cg block, initialize the inode
block and write it. After the write is finished, reread cg block and
update the cg_initediblk.
If inode block is already locked by another thread, let the another
thread initialize it. If another thread raced with us after we
started writing inode block, the situation is detected by an update of
cg_initediblk. Note that double-initialization of the inode block is
harmless, the block cannot be used until cg_initediblk is incremented.
Sponsored by: The FreeBSD Foundation
In collaboration with: pho
Reviewed by: mckusick
MFC after: 1 month
X-MFC-note: after r246877
2013-02-27 07:31:23 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* After the inode block is written, try to update the
|
|
|
|
* cg initediblk pointer. If another thread beat us
|
|
|
|
* to it, then leave it unchanged as the other thread
|
|
|
|
* has already set it correctly.
|
|
|
|
*/
|
2017-06-28 17:32:09 +00:00
|
|
|
error = ffs_getcg(fs, ump->um_devvp, cg, &bp, &cgp);
|
An inode block must not be blockingly read while cg block is owned.
The order is inode buffer lock -> snaplk -> cg buffer lock, reversing
the order causes deadlocks.
Inode block must not be written while cg block buffer is owned. The
FFS copy on write needs to allocate a block to copy the content of the
inode block, and the cylinder group selected for the allocation might
be the same as the owned cg block. The reserved block detection code
in the ffs_copyonwrite() and ffs_bp_snapblk() is unable to detect the
situation, because the locked cg buffer is not exposed to it.
In order to maintain the dependency between initialized inode block
and the cg_initediblk pointer, look up the inode buffer in
non-blocking mode. If succeeded, brelse cg block, initialize the inode
block and write it. After the write is finished, reread cg block and
update the cg_initediblk.
If inode block is already locked by another thread, let the another
thread initialize it. If another thread raced with us after we
started writing inode block, the situation is detected by an update of
cg_initediblk. Note that double-initialization of the inode block is
harmless, the block cannot be used until cg_initediblk is incremented.
Sponsored by: The FreeBSD Foundation
In collaboration with: pho
Reviewed by: mckusick
MFC after: 1 month
X-MFC-note: after r246877
2013-02-27 07:31:23 +00:00
|
|
|
UFS_LOCK(ump);
|
|
|
|
ACTIVECLEAR(fs, cg);
|
|
|
|
UFS_UNLOCK(ump);
|
2017-06-28 17:32:09 +00:00
|
|
|
if (error != 0)
|
An inode block must not be blockingly read while cg block is owned.
The order is inode buffer lock -> snaplk -> cg buffer lock, reversing
the order causes deadlocks.
Inode block must not be written while cg block buffer is owned. The
FFS copy on write needs to allocate a block to copy the content of the
inode block, and the cylinder group selected for the allocation might
be the same as the owned cg block. The reserved block detection code
in the ffs_copyonwrite() and ffs_bp_snapblk() is unable to detect the
situation, because the locked cg buffer is not exposed to it.
In order to maintain the dependency between initialized inode block
and the cg_initediblk pointer, look up the inode buffer in
non-blocking mode. If succeeded, brelse cg block, initialize the inode
block and write it. After the write is finished, reread cg block and
update the cg_initediblk.
If inode block is already locked by another thread, let the another
thread initialize it. If another thread raced with us after we
started writing inode block, the situation is detected by an update of
cg_initediblk. Note that double-initialization of the inode block is
harmless, the block cannot be used until cg_initediblk is incremented.
Sponsored by: The FreeBSD Foundation
In collaboration with: pho
Reviewed by: mckusick
MFC after: 1 month
X-MFC-note: after r246877
2013-02-27 07:31:23 +00:00
|
|
|
return (error);
|
|
|
|
if (cgp->cg_initediblk == old_initediblk)
|
|
|
|
cgp->cg_initediblk += INOPB(fs);
|
|
|
|
goto restart;
|
2002-06-21 06:18:05 +00:00
|
|
|
}
|
An inode block must not be blockingly read while cg block is owned.
The order is inode buffer lock -> snaplk -> cg buffer lock, reversing
the order causes deadlocks.
Inode block must not be written while cg block buffer is owned. The
FFS copy on write needs to allocate a block to copy the content of the
inode block, and the cylinder group selected for the allocation might
be the same as the owned cg block. The reserved block detection code
in the ffs_copyonwrite() and ffs_bp_snapblk() is unable to detect the
situation, because the locked cg buffer is not exposed to it.
In order to maintain the dependency between initialized inode block
and the cg_initediblk pointer, look up the inode buffer in
non-blocking mode. If succeeded, brelse cg block, initialize the inode
block and write it. After the write is finished, reread cg block and
update the cg_initediblk.
If inode block is already locked by another thread, let the another
thread initialize it. If another thread raced with us after we
started writing inode block, the situation is detected by an update of
cg_initediblk. Note that double-initialization of the inode block is
harmless, the block cannot be used until cg_initediblk is incremented.
Sponsored by: The FreeBSD Foundation
In collaboration with: pho
Reviewed by: mckusick
MFC after: 1 month
X-MFC-note: after r246877
2013-02-27 07:31:23 +00:00
|
|
|
cgp->cg_irotor = ipref;
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
|
|
|
ACTIVECLEAR(fs, cg);
|
|
|
|
setbit(inosused, ipref);
|
|
|
|
cgp->cg_cs.cs_nifree--;
|
|
|
|
fs->fs_cstotal.cs_nifree--;
|
|
|
|
fs->fs_cs(fs, cg).cs_nifree--;
|
|
|
|
fs->fs_fmod = 1;
|
2018-03-17 12:59:55 +00:00
|
|
|
if ((mode & IFMT) == IFDIR) {
|
2005-01-24 10:08:35 +00:00
|
|
|
cgp->cg_cs.cs_ndir++;
|
|
|
|
fs->fs_cstotal.cs_ndir++;
|
|
|
|
fs->fs_cs(fs, cg).cs_ndir++;
|
|
|
|
}
|
|
|
|
UFS_UNLOCK(ump);
|
|
|
|
if (DOINGSOFTDEP(ITOV(ip)))
|
2011-06-20 03:25:09 +00:00
|
|
|
softdep_setup_inomapdep(bp, ip, cg * fs->fs_ipg + ipref, mode);
|
1994-05-24 10:09:53 +00:00
|
|
|
bdwrite(bp);
|
2010-02-10 20:10:35 +00:00
|
|
|
return ((ino_t)(cg * fs->fs_ipg + ipref));
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Free a block or fragment.
|
|
|
|
*
|
|
|
|
* The specified block or fragment is placed back in the
|
1995-05-30 08:16:23 +00:00
|
|
|
* free map. If a fragment is deallocated, a possible
|
1994-05-24 10:09:53 +00:00
|
|
|
* block reassembly is checked.
|
|
|
|
*/
|
2010-12-29 12:25:28 +00:00
|
|
|
static void
|
|
|
|
ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd)
|
2005-01-24 10:08:35 +00:00
|
|
|
struct ufsmount *ump;
|
2002-02-02 01:42:44 +00:00
|
|
|
struct fs *fs;
|
|
|
|
struct vnode *devvp;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t bno;
|
1994-05-24 10:09:53 +00:00
|
|
|
long size;
|
2002-02-02 01:42:44 +00:00
|
|
|
ino_t inum;
|
2010-04-24 07:05:35 +00:00
|
|
|
struct workhead *dephd;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2010-04-24 07:05:35 +00:00
|
|
|
struct mount *mp;
|
2002-02-02 01:42:44 +00:00
|
|
|
struct cg *cgp;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct buf *bp;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs1_daddr_t fragno, cgbno;
|
2017-06-28 17:32:09 +00:00
|
|
|
int i, blk, frags, bbase, error;
|
2010-02-10 20:10:35 +00:00
|
|
|
u_int cg;
|
2000-03-15 07:08:36 +00:00
|
|
|
u_int8_t *blksfree;
|
2004-06-16 09:47:26 +00:00
|
|
|
struct cdev *dev;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2002-02-02 01:42:44 +00:00
|
|
|
cg = dtog(fs, bno);
|
2009-02-06 17:14:07 +00:00
|
|
|
if (devvp->v_type == VREG) {
|
2002-02-02 01:42:44 +00:00
|
|
|
/* devvp is a snapshot */
|
2016-09-17 16:47:34 +00:00
|
|
|
MPASS(devvp->v_mount->mnt_data == ump);
|
|
|
|
dev = ump->um_devvp->v_rdev;
|
2016-09-19 15:58:33 +00:00
|
|
|
} else if (devvp->v_type == VCHR) {
|
2002-02-02 01:42:44 +00:00
|
|
|
/* devvp is a normal disk device */
|
|
|
|
dev = devvp->v_rdev;
|
2011-05-26 23:56:58 +00:00
|
|
|
ASSERT_VOP_LOCKED(devvp, "ffs_blkfree_cg");
|
2016-09-19 15:58:33 +00:00
|
|
|
} else
|
|
|
|
return;
|
2007-11-08 17:21:51 +00:00
|
|
|
#ifdef INVARIANTS
|
1998-03-08 09:59:44 +00:00
|
|
|
if ((u_int)size > fs->fs_bsize || fragoff(fs, size) != 0 ||
|
|
|
|
fragnum(fs, bno) + numfrags(fs, size) > fs->fs_frag) {
|
2002-09-19 03:55:30 +00:00
|
|
|
printf("dev=%s, bno = %jd, bsize = %ld, size = %ld, fs = %s\n",
|
2002-06-21 06:18:05 +00:00
|
|
|
devtoname(dev), (intmax_t)bno, (long)fs->fs_bsize,
|
2002-02-02 01:42:44 +00:00
|
|
|
size, fs->fs_fsmnt);
|
2011-06-15 23:19:09 +00:00
|
|
|
panic("ffs_blkfree_cg: bad size");
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2000-07-11 22:07:57 +00:00
|
|
|
#endif
|
1994-05-24 10:09:53 +00:00
|
|
|
if ((u_int)bno >= fs->fs_size) {
|
2002-06-23 18:17:27 +00:00
|
|
|
printf("bad block %jd, ino %lu\n", (intmax_t)bno,
|
2002-06-21 06:18:05 +00:00
|
|
|
(u_long)inum);
|
2002-02-02 01:42:44 +00:00
|
|
|
ffs_fserr(fs, inum, "bad block");
|
1994-05-24 10:09:53 +00:00
|
|
|
return;
|
|
|
|
}
|
2017-06-28 17:32:09 +00:00
|
|
|
if ((error = ffs_getcg(fs, devvp, cg, &bp, &cgp)) != 0)
|
1994-05-24 10:09:53 +00:00
|
|
|
return;
|
2001-03-21 04:01:02 +00:00
|
|
|
cgbno = dtogd(fs, bno);
|
2000-03-15 07:08:36 +00:00
|
|
|
blksfree = cg_blksfree(cgp);
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (size == fs->fs_bsize) {
|
2001-03-21 04:01:02 +00:00
|
|
|
fragno = fragstoblks(fs, cgbno);
|
|
|
|
if (!ffs_isfreeblock(fs, blksfree, fragno)) {
|
2009-02-06 17:14:07 +00:00
|
|
|
if (devvp->v_type == VREG) {
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_UNLOCK(ump);
|
2002-02-02 01:42:44 +00:00
|
|
|
/* devvp is a snapshot */
|
|
|
|
brelse(bp);
|
|
|
|
return;
|
|
|
|
}
|
2002-06-23 18:17:27 +00:00
|
|
|
printf("dev = %s, block = %jd, fs = %s\n",
|
2002-06-21 06:18:05 +00:00
|
|
|
devtoname(dev), (intmax_t)bno, fs->fs_fsmnt);
|
2011-06-15 23:19:09 +00:00
|
|
|
panic("ffs_blkfree_cg: freeing free block");
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2001-03-21 04:01:02 +00:00
|
|
|
ffs_setblock(fs, blksfree, fragno);
|
2010-04-24 07:05:35 +00:00
|
|
|
ffs_clusteracct(fs, cgp, fragno, 1);
|
1994-05-24 10:09:53 +00:00
|
|
|
cgp->cg_cs.cs_nbfree++;
|
|
|
|
fs->fs_cstotal.cs_nbfree++;
|
|
|
|
fs->fs_cs(fs, cg).cs_nbfree++;
|
|
|
|
} else {
|
2001-03-21 04:01:02 +00:00
|
|
|
bbase = cgbno - fragnum(fs, cgbno);
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* decrement the counts associated with the old frags
|
|
|
|
*/
|
2000-03-15 07:08:36 +00:00
|
|
|
blk = blkmap(fs, blksfree, bbase);
|
1994-05-24 10:09:53 +00:00
|
|
|
ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
|
|
|
|
/*
|
|
|
|
* deallocate the fragment
|
|
|
|
*/
|
|
|
|
frags = numfrags(fs, size);
|
|
|
|
for (i = 0; i < frags; i++) {
|
2001-03-21 04:01:02 +00:00
|
|
|
if (isset(blksfree, cgbno + i)) {
|
2002-06-23 18:17:27 +00:00
|
|
|
printf("dev = %s, block = %jd, fs = %s\n",
|
2002-06-21 06:18:05 +00:00
|
|
|
devtoname(dev), (intmax_t)(bno + i),
|
1998-07-11 07:46:16 +00:00
|
|
|
fs->fs_fsmnt);
|
2011-06-15 23:19:09 +00:00
|
|
|
panic("ffs_blkfree_cg: freeing free frag");
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2001-03-21 04:01:02 +00:00
|
|
|
setbit(blksfree, cgbno + i);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
|
|
|
cgp->cg_cs.cs_nffree += i;
|
|
|
|
fs->fs_cstotal.cs_nffree += i;
|
|
|
|
fs->fs_cs(fs, cg).cs_nffree += i;
|
|
|
|
/*
|
|
|
|
* add back in counts associated with the new frags
|
|
|
|
*/
|
2000-03-15 07:08:36 +00:00
|
|
|
blk = blkmap(fs, blksfree, bbase);
|
1994-05-24 10:09:53 +00:00
|
|
|
ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
|
|
|
|
/*
|
|
|
|
* if a complete block has been reassembled, account for it
|
|
|
|
*/
|
2001-03-21 04:01:02 +00:00
|
|
|
fragno = fragstoblks(fs, bbase);
|
|
|
|
if (ffs_isblock(fs, blksfree, fragno)) {
|
1994-05-24 10:09:53 +00:00
|
|
|
cgp->cg_cs.cs_nffree -= fs->fs_frag;
|
|
|
|
fs->fs_cstotal.cs_nffree -= fs->fs_frag;
|
|
|
|
fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
|
2010-04-24 07:05:35 +00:00
|
|
|
ffs_clusteracct(fs, cgp, fragno, 1);
|
1994-05-24 10:09:53 +00:00
|
|
|
cgp->cg_cs.cs_nbfree++;
|
|
|
|
fs->fs_cstotal.cs_nbfree++;
|
|
|
|
fs->fs_cs(fs, cg).cs_nbfree++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
fs->fs_fmod = 1;
|
2005-01-24 10:08:35 +00:00
|
|
|
ACTIVECLEAR(fs, cg);
|
|
|
|
UFS_UNLOCK(ump);
|
2010-04-24 07:05:35 +00:00
|
|
|
mp = UFSTOVFS(ump);
|
2016-09-19 15:58:33 +00:00
|
|
|
if (MOUNTEDSOFTDEP(mp) && devvp->v_type == VCHR)
|
2010-04-24 07:05:35 +00:00
|
|
|
softdep_setup_blkfree(UFSTOVFS(ump), bp, bno,
|
|
|
|
numfrags(fs, size), dephd);
|
1994-05-24 10:09:53 +00:00
|
|
|
bdwrite(bp);
|
|
|
|
}
|
|
|
|
|
2010-12-29 12:25:28 +00:00
|
|
|
struct ffs_blkfree_trim_params {
|
|
|
|
struct task task;
|
|
|
|
struct ufsmount *ump;
|
|
|
|
struct vnode *devvp;
|
|
|
|
ufs2_daddr_t bno;
|
|
|
|
long size;
|
|
|
|
ino_t inum;
|
|
|
|
struct workhead *pdephd;
|
|
|
|
struct workhead dephd;
|
|
|
|
};
|
|
|
|
|
|
|
|
static void
|
|
|
|
ffs_blkfree_trim_task(ctx, pending)
|
|
|
|
void *ctx;
|
|
|
|
int pending;
|
|
|
|
{
|
|
|
|
struct ffs_blkfree_trim_params *tp;
|
|
|
|
|
|
|
|
tp = ctx;
|
|
|
|
ffs_blkfree_cg(tp->ump, tp->ump->um_fs, tp->devvp, tp->bno, tp->size,
|
|
|
|
tp->inum, tp->pdephd);
|
|
|
|
vn_finished_secondary_write(UFSTOVFS(tp->ump));
|
2016-03-27 08:21:17 +00:00
|
|
|
atomic_add_int(&tp->ump->um_trim_inflight, -1);
|
2010-12-29 12:25:28 +00:00
|
|
|
free(tp, M_TEMP);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
ffs_blkfree_trim_completed(bip)
|
|
|
|
struct bio *bip;
|
|
|
|
{
|
|
|
|
struct ffs_blkfree_trim_params *tp;
|
|
|
|
|
|
|
|
tp = bip->bio_caller2;
|
|
|
|
g_destroy_bio(bip);
|
|
|
|
TASK_INIT(&tp->task, 0, ffs_blkfree_trim_task, tp);
|
2016-03-27 08:21:17 +00:00
|
|
|
taskqueue_enqueue(tp->ump->um_trim_tq, &tp->task);
|
2010-12-29 12:25:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
2011-06-15 23:19:09 +00:00
|
|
|
ffs_blkfree(ump, fs, devvp, bno, size, inum, vtype, dephd)
|
2010-12-29 12:25:28 +00:00
|
|
|
struct ufsmount *ump;
|
|
|
|
struct fs *fs;
|
|
|
|
struct vnode *devvp;
|
|
|
|
ufs2_daddr_t bno;
|
|
|
|
long size;
|
|
|
|
ino_t inum;
|
2011-06-15 23:19:09 +00:00
|
|
|
enum vtype vtype;
|
2010-12-29 12:25:28 +00:00
|
|
|
struct workhead *dephd;
|
|
|
|
{
|
|
|
|
struct mount *mp;
|
|
|
|
struct bio *bip;
|
|
|
|
struct ffs_blkfree_trim_params *tp;
|
|
|
|
|
2011-05-26 23:56:58 +00:00
|
|
|
/*
|
|
|
|
* Check to see if a snapshot wants to claim the block.
|
|
|
|
* Check that devvp is a normal disk device, not a snapshot,
|
|
|
|
* it has a snapshot(s) associated with it, and one of the
|
|
|
|
* snapshots wants to claim the block.
|
|
|
|
*/
|
2016-09-19 15:58:33 +00:00
|
|
|
if (devvp->v_type == VCHR &&
|
2011-05-26 23:56:58 +00:00
|
|
|
(devvp->v_vflag & VV_COPYONWRITE) &&
|
2011-06-15 23:19:09 +00:00
|
|
|
ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, dephd)) {
|
2011-05-26 23:56:58 +00:00
|
|
|
return;
|
|
|
|
}
|
2011-07-10 05:34:49 +00:00
|
|
|
/*
|
|
|
|
* Nothing to delay if TRIM is disabled, or the operation is
|
|
|
|
* performed on the snapshot.
|
|
|
|
*/
|
|
|
|
if (!ump->um_candelete || devvp->v_type == VREG) {
|
2010-12-29 12:25:28 +00:00
|
|
|
ffs_blkfree_cg(ump, fs, devvp, bno, size, inum, dephd);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Postpone the set of the free bit in the cg bitmap until the
|
|
|
|
* BIO_DELETE is completed. Otherwise, due to disk queue
|
|
|
|
* reordering, TRIM might be issued after we reuse the block
|
|
|
|
* and write some new data into it.
|
|
|
|
*/
|
2016-03-27 08:21:17 +00:00
|
|
|
atomic_add_int(&ump->um_trim_inflight, 1);
|
2010-12-29 12:25:28 +00:00
|
|
|
tp = malloc(sizeof(struct ffs_blkfree_trim_params), M_TEMP, M_WAITOK);
|
|
|
|
tp->ump = ump;
|
|
|
|
tp->devvp = devvp;
|
|
|
|
tp->bno = bno;
|
|
|
|
tp->size = size;
|
|
|
|
tp->inum = inum;
|
|
|
|
if (dephd != NULL) {
|
|
|
|
LIST_INIT(&tp->dephd);
|
|
|
|
LIST_SWAP(dephd, &tp->dephd, worklist, wk_list);
|
|
|
|
tp->pdephd = &tp->dephd;
|
|
|
|
} else
|
|
|
|
tp->pdephd = NULL;
|
|
|
|
|
|
|
|
bip = g_alloc_bio();
|
|
|
|
bip->bio_cmd = BIO_DELETE;
|
|
|
|
bip->bio_offset = dbtob(fsbtodb(fs, bno));
|
|
|
|
bip->bio_done = ffs_blkfree_trim_completed;
|
|
|
|
bip->bio_length = size;
|
|
|
|
bip->bio_caller2 = tp;
|
|
|
|
|
|
|
|
mp = UFSTOVFS(ump);
|
|
|
|
vn_start_secondary_write(NULL, &mp, 0);
|
|
|
|
g_io_request(bip, (struct g_consumer *)devvp->v_bufobj.bo_private);
|
|
|
|
}
|
|
|
|
|
2007-11-08 17:21:51 +00:00
|
|
|
#ifdef INVARIANTS
/*
 * Verify allocation of a block or fragment.
 *
 * ip identifies the filesystem and its device vnode, bno is the
 * filesystem block number to check and size is the length of the
 * region in bytes (a multiple of the fragment size, at most one block).
 *
 * Returns true if the block or fragment is allocated, false if it is
 * free.  Panics if size or bno is out of range, if the cylinder group
 * cannot be read, or if a fragment run is only partially free.
 */
static int
ffs_checkblk(ip, bno, size)
	struct inode *ip;
	ufs2_daddr_t bno;
	long size;
{
	struct fs *fs;
	struct cg *cgp;
	struct buf *bp;
	ufs1_daddr_t cgbno;
	int i, error, frags, nfree;
	u_int8_t *blksfree;

	fs = ITOFS(ip);
	/*
	 * Compare at full width.  Narrowing size to u_int would let a
	 * value with high bits set wrap into the valid range and slip
	 * past this check.
	 */
	if (size < 0 || size > fs->fs_bsize || fragoff(fs, size) != 0) {
		printf("bsize = %ld, size = %ld, fs = %s\n",
		    (long)fs->fs_bsize, size, fs->fs_fsmnt);
		panic("ffs_checkblk: bad size");
	}
	/*
	 * Likewise bno is a 64-bit block number; truncating it to
	 * 32 bits before the range check would accept block numbers
	 * that lie beyond the end of the filesystem.
	 */
	if (bno < 0 || bno >= fs->fs_size)
		panic("ffs_checkblk: bad block %jd", (intmax_t)bno);
	error = ffs_getcg(fs, ITODEVVP(ip), dtog(fs, bno), &bp, &cgp);
	if (error)
		panic("ffs_checkblk: cylinder group read failed");
	blksfree = cg_blksfree(cgp);
	cgbno = dtogd(fs, bno);
	if (size == fs->fs_bsize) {
		/* Whole block: one lookup in the free map. */
		nfree = ffs_isblock(fs, blksfree, fragstoblks(fs, cgbno));
	} else {
		/* Fragment run: every fragment must agree. */
		frags = numfrags(fs, size);
		for (nfree = 0, i = 0; i < frags; i++)
			if (isset(blksfree, cgbno + i))
				nfree++;
		if (nfree != 0 && nfree != frags)
			panic("ffs_checkblk: partially free fragment");
	}
	brelse(bp);
	return (!nfree);
}
#endif /* INVARIANTS */
|
1997-02-10 02:22:35 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Free an inode.
|
|
|
|
*/
|
|
|
|
int
|
2001-03-21 04:09:01 +00:00
|
|
|
ffs_vfree(pvp, ino, mode)
|
1998-03-08 09:59:44 +00:00
|
|
|
struct vnode *pvp;
|
|
|
|
ino_t ino;
|
|
|
|
int mode;
|
|
|
|
{
|
2016-09-17 16:47:34 +00:00
|
|
|
struct ufsmount *ump;
|
2005-01-24 10:08:35 +00:00
|
|
|
|
1998-03-08 09:59:44 +00:00
|
|
|
if (DOINGSOFTDEP(pvp)) {
|
|
|
|
softdep_freefile(pvp, ino, mode);
|
|
|
|
return (0);
|
|
|
|
}
|
2016-09-17 16:47:34 +00:00
|
|
|
ump = VFSTOUFS(pvp->v_mount);
|
|
|
|
return (ffs_freefile(ump, ump->um_fs, ump->um_devvp, ino, mode, NULL));
|
1998-03-08 09:59:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Do the actual free operation.
|
|
|
|
* The specified inode is placed back in the free map.
|
|
|
|
*/
|
2001-03-21 04:09:01 +00:00
|
|
|
int
|
2010-04-24 07:05:35 +00:00
|
|
|
ffs_freefile(ump, fs, devvp, ino, mode, wkhd)
|
2005-01-24 10:08:35 +00:00
|
|
|
struct ufsmount *ump;
|
2002-02-02 01:42:44 +00:00
|
|
|
struct fs *fs;
|
|
|
|
struct vnode *devvp;
|
VFS mega cleanup commit (x/N)
1. Add new file "sys/kern/vfs_default.c" where default actions for
VOPs go. Implement proper defaults for ABORTOP, BWRITE, LEASE,
POLL, REVOKE and STRATEGY. Various stuff spread over the entire
tree belongs here.
2. Change VOP_BLKATOFF to a normal function in cd9660.
3. Kill VOP_BLKATOFF, VOP_TRUNCATE, VOP_VFREE, VOP_VALLOC. These
are private interface functions between UFS and the underlying
storage manager layer (FFS/LFS/MFS/EXT2FS). The functions now
live in struct ufsmount instead.
4. Remove a kludge of VOP_ functions in all filesystems, that did
nothing but obscure the simplicity and break the expandability.
If a filesystem doesn't implement VOP_FOO, it shouldn't have an
entry for it in its vnops table. The system will try to DTRT
if it is not implemented. There are still some cruft left, but
the bulk of it is done.
5. Fix another VCALL in vfs_cache.c (thanks Bruce!)
1997-10-16 10:50:27 +00:00
|
|
|
ino_t ino;
|
|
|
|
int mode;
|
2010-04-24 07:05:35 +00:00
|
|
|
struct workhead *wkhd;
|
1994-05-24 10:09:53 +00:00
|
|
|
{
|
2002-02-02 01:42:44 +00:00
|
|
|
struct cg *cgp;
|
1994-05-24 10:09:53 +00:00
|
|
|
struct buf *bp;
|
2010-02-10 20:10:35 +00:00
|
|
|
int error;
|
|
|
|
u_int cg;
|
2000-03-15 07:08:36 +00:00
|
|
|
u_int8_t *inosused;
|
2004-06-16 09:47:26 +00:00
|
|
|
struct cdev *dev;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
cg = ino_to_cg(fs, ino);
|
2009-02-06 17:14:07 +00:00
|
|
|
if (devvp->v_type == VREG) {
|
2002-02-02 01:42:44 +00:00
|
|
|
/* devvp is a snapshot */
|
2016-09-17 16:47:34 +00:00
|
|
|
MPASS(devvp->v_mount->mnt_data == ump);
|
|
|
|
dev = ump->um_devvp->v_rdev;
|
2016-09-19 15:58:33 +00:00
|
|
|
} else if (devvp->v_type == VCHR) {
|
2002-02-02 01:42:44 +00:00
|
|
|
/* devvp is a normal disk device */
|
|
|
|
dev = devvp->v_rdev;
|
2016-09-19 15:58:33 +00:00
|
|
|
} else {
|
|
|
|
bp = NULL;
|
|
|
|
return (0);
|
2002-02-02 01:42:44 +00:00
|
|
|
}
|
2010-02-10 20:10:35 +00:00
|
|
|
if (ino >= fs->fs_ipg * fs->fs_ncg)
|
2012-09-27 23:30:49 +00:00
|
|
|
panic("ffs_freefile: range: dev = %s, ino = %ju, fs = %s",
|
|
|
|
devtoname(dev), (uintmax_t)ino, fs->fs_fsmnt);
|
2017-06-28 17:32:09 +00:00
|
|
|
if ((error = ffs_getcg(fs, devvp, cg, &bp, &cgp)) != 0)
|
1998-03-08 09:59:44 +00:00
|
|
|
return (error);
|
2000-03-15 07:08:36 +00:00
|
|
|
inosused = cg_inosused(cgp);
|
1994-05-24 10:09:53 +00:00
|
|
|
ino %= fs->fs_ipg;
|
2000-03-15 07:08:36 +00:00
|
|
|
if (isclr(inosused, ino)) {
|
2012-09-27 23:30:49 +00:00
|
|
|
printf("dev = %s, ino = %ju, fs = %s\n", devtoname(dev),
|
|
|
|
(uintmax_t)(ino + cg * fs->fs_ipg), fs->fs_fsmnt);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (fs->fs_ronly == 0)
|
2002-12-18 00:53:45 +00:00
|
|
|
panic("ffs_freefile: freeing free inode");
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2000-03-15 07:08:36 +00:00
|
|
|
clrbit(inosused, ino);
|
1994-05-24 10:09:53 +00:00
|
|
|
if (ino < cgp->cg_irotor)
|
|
|
|
cgp->cg_irotor = ino;
|
|
|
|
cgp->cg_cs.cs_nifree++;
|
2005-01-24 10:08:35 +00:00
|
|
|
UFS_LOCK(ump);
|
1994-05-24 10:09:53 +00:00
|
|
|
fs->fs_cstotal.cs_nifree++;
|
|
|
|
fs->fs_cs(fs, cg).cs_nifree++;
|
2018-03-17 12:59:55 +00:00
|
|
|
if ((mode & IFMT) == IFDIR) {
|
1994-05-24 10:09:53 +00:00
|
|
|
cgp->cg_cs.cs_ndir--;
|
|
|
|
fs->fs_cstotal.cs_ndir--;
|
|
|
|
fs->fs_cs(fs, cg).cs_ndir--;
|
|
|
|
}
|
|
|
|
fs->fs_fmod = 1;
|
2005-01-24 10:08:35 +00:00
|
|
|
ACTIVECLEAR(fs, cg);
|
|
|
|
UFS_UNLOCK(ump);
|
2016-09-19 15:58:33 +00:00
|
|
|
if (MOUNTEDSOFTDEP(UFSTOVFS(ump)) && devvp->v_type == VCHR)
|
2010-04-24 07:05:35 +00:00
|
|
|
softdep_setup_inofree(UFSTOVFS(ump), bp,
|
|
|
|
ino + cg * fs->fs_ipg, wkhd);
|
1994-05-24 10:09:53 +00:00
|
|
|
bdwrite(bp);
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2003-02-22 00:29:51 +00:00
|
|
|
/*
|
|
|
|
* Check to see if a file is free.
|
2017-06-28 17:32:09 +00:00
|
|
|
* Used to check for allocated files in snapshots.
|
2003-02-22 00:29:51 +00:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
ffs_checkfreefile(fs, devvp, ino)
|
|
|
|
struct fs *fs;
|
|
|
|
struct vnode *devvp;
|
|
|
|
ino_t ino;
|
|
|
|
{
|
|
|
|
struct cg *cgp;
|
|
|
|
struct buf *bp;
|
2017-06-28 17:32:09 +00:00
|
|
|
int ret, error;
|
2010-02-10 20:10:35 +00:00
|
|
|
u_int cg;
|
2003-02-22 00:29:51 +00:00
|
|
|
u_int8_t *inosused;
|
|
|
|
|
|
|
|
cg = ino_to_cg(fs, ino);
|
2018-05-19 19:30:42 +00:00
|
|
|
if ((devvp->v_type != VREG) && (devvp->v_type != VCHR))
|
2016-09-19 15:58:33 +00:00
|
|
|
return (1);
|
2010-02-10 20:10:35 +00:00
|
|
|
if (ino >= fs->fs_ipg * fs->fs_ncg)
|
2003-02-22 00:29:51 +00:00
|
|
|
return (1);
|
2017-06-28 17:32:09 +00:00
|
|
|
if ((error = ffs_getcg(fs, devvp, cg, &bp, &cgp)) != 0)
|
2003-02-22 00:29:51 +00:00
|
|
|
return (1);
|
|
|
|
inosused = cg_inosused(cgp);
|
|
|
|
ino %= fs->fs_ipg;
|
|
|
|
ret = isclr(inosused, ino);
|
|
|
|
brelse(bp);
|
|
|
|
return (ret);
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Find a block of the specified size in the specified cylinder group.
|
|
|
|
*
|
|
|
|
* It is a panic if a request is made to find a block if none are
|
|
|
|
* available.
|
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
static ufs1_daddr_t
|
1994-05-24 10:09:53 +00:00
|
|
|
ffs_mapsearch(fs, cgp, bpref, allocsiz)
|
2002-05-13 09:22:31 +00:00
|
|
|
struct fs *fs;
|
|
|
|
struct cg *cgp;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t bpref;
|
1994-05-24 10:09:53 +00:00
|
|
|
int allocsiz;
|
|
|
|
{
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs1_daddr_t bno;
|
1994-05-24 10:09:53 +00:00
|
|
|
int start, len, loc, i;
|
|
|
|
int blk, field, subfield, pos;
|
2000-03-15 07:08:36 +00:00
|
|
|
u_int8_t *blksfree;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* find the fragment by searching through the free block
|
|
|
|
* map for an appropriate bit pattern
|
|
|
|
*/
|
|
|
|
if (bpref)
|
|
|
|
start = dtogd(fs, bpref) / NBBY;
|
|
|
|
else
|
|
|
|
start = cgp->cg_frotor / NBBY;
|
2000-03-15 07:08:36 +00:00
|
|
|
blksfree = cg_blksfree(cgp);
|
1994-05-24 10:09:53 +00:00
|
|
|
len = howmany(fs->fs_fpg, NBBY) - start;
|
2000-03-15 07:08:36 +00:00
|
|
|
loc = scanc((u_int)len, (u_char *)&blksfree[start],
|
2006-07-18 07:03:43 +00:00
|
|
|
fragtbl[fs->fs_frag],
|
1994-05-24 10:09:53 +00:00
|
|
|
(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
|
|
|
|
if (loc == 0) {
|
|
|
|
len = start + 1;
|
|
|
|
start = 0;
|
2000-03-15 07:08:36 +00:00
|
|
|
loc = scanc((u_int)len, (u_char *)&blksfree[0],
|
2006-07-18 07:03:43 +00:00
|
|
|
fragtbl[fs->fs_frag],
|
1994-05-24 10:09:53 +00:00
|
|
|
(u_char)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
|
|
|
|
if (loc == 0) {
|
|
|
|
printf("start = %d, len = %d, fs = %s\n",
|
|
|
|
start, len, fs->fs_fsmnt);
|
|
|
|
panic("ffs_alloccg: map corrupted");
|
|
|
|
/* NOTREACHED */
|
|
|
|
}
|
|
|
|
}
|
|
|
|
bno = (start + len - loc) * NBBY;
|
|
|
|
cgp->cg_frotor = bno;
|
|
|
|
/*
|
|
|
|
* found the byte in the map
|
|
|
|
* sift through the bits to find the selected frag
|
|
|
|
*/
|
|
|
|
for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
|
2000-03-15 07:08:36 +00:00
|
|
|
blk = blkmap(fs, blksfree, bno);
|
1994-05-24 10:09:53 +00:00
|
|
|
blk <<= 1;
|
|
|
|
field = around[allocsiz];
|
|
|
|
subfield = inside[allocsiz];
|
|
|
|
for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
|
|
|
|
if ((blk & field) == subfield)
|
|
|
|
return (bno + pos);
|
|
|
|
field <<= 1;
|
|
|
|
subfield <<= 1;
|
|
|
|
}
|
|
|
|
}
|
1994-10-10 01:04:55 +00:00
|
|
|
printf("bno = %lu, fs = %s\n", (u_long)bno, fs->fs_fsmnt);
|
1994-05-24 10:09:53 +00:00
|
|
|
panic("ffs_alloccg: block not in map");
|
|
|
|
return (-1);
|
|
|
|
}
|
|
|
|
|
2017-11-05 13:28:48 +00:00
|
|
|
static const struct statfs *
|
|
|
|
ffs_getmntstat(struct vnode *devvp)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (devvp->v_type == VCHR)
|
|
|
|
return (&devvp->v_rdev->si_mountpt->mnt_stat);
|
|
|
|
return (ffs_getmntstat(VFSTOUFS(devvp->v_mount)->um_devvp));
|
|
|
|
}
|
|
|
|
|
2017-06-28 17:32:09 +00:00
|
|
|
/*
|
|
|
|
* Fetch and verify a cylinder group.
|
|
|
|
*/
|
|
|
|
int
|
|
|
|
ffs_getcg(fs, devvp, cg, bpp, cgpp)
|
|
|
|
struct fs *fs;
|
|
|
|
struct vnode *devvp;
|
|
|
|
u_int cg;
|
|
|
|
struct buf **bpp;
|
|
|
|
struct cg **cgpp;
|
|
|
|
{
|
|
|
|
struct buf *bp;
|
|
|
|
struct cg *cgp;
|
2017-11-05 13:28:48 +00:00
|
|
|
const struct statfs *sfs;
|
2017-09-22 12:45:15 +00:00
|
|
|
int flags, error;
|
2017-06-28 17:32:09 +00:00
|
|
|
|
|
|
|
*bpp = NULL;
|
|
|
|
*cgpp = NULL;
|
2017-09-22 12:45:15 +00:00
|
|
|
flags = 0;
|
|
|
|
if ((fs->fs_metackhash & CK_CYLGRP) != 0)
|
|
|
|
flags |= GB_CKHASH;
|
|
|
|
error = breadn_flags(devvp, devvp->v_type == VREG ?
|
2017-07-16 07:11:29 +00:00
|
|
|
fragstoblks(fs, cgtod(fs, cg)) : fsbtodb(fs, cgtod(fs, cg)),
|
2017-09-22 12:45:15 +00:00
|
|
|
(int)fs->fs_cgsize, NULL, NULL, 0, NOCRED, flags,
|
|
|
|
ffs_ckhash_cg, &bp);
|
2017-06-28 17:32:09 +00:00
|
|
|
if (error != 0)
|
|
|
|
return (error);
|
|
|
|
cgp = (struct cg *)bp->b_data;
|
2018-01-31 23:13:37 +00:00
|
|
|
if ((fs->fs_metackhash & CK_CYLGRP) != 0 &&
|
2017-09-22 12:45:15 +00:00
|
|
|
(bp->b_flags & B_CKHASH) != 0 &&
|
2018-01-31 23:13:37 +00:00
|
|
|
cgp->cg_ckhash != bp->b_ckhash) {
|
2017-11-05 13:28:48 +00:00
|
|
|
sfs = ffs_getmntstat(devvp);
|
|
|
|
printf("UFS %s%s (%s) cylinder checksum failed: cg %u, cgp: "
|
|
|
|
"0x%x != bp: 0x%jx\n",
|
|
|
|
devvp->v_type == VCHR ? "" : "snapshot of ",
|
|
|
|
sfs->f_mntfromname, sfs->f_mntonname,
|
2017-09-22 16:42:41 +00:00
|
|
|
cg, cgp->cg_ckhash, (uintmax_t)bp->b_ckhash);
|
2017-09-22 12:45:15 +00:00
|
|
|
bp->b_flags &= ~B_CKHASH;
|
|
|
|
bp->b_flags |= B_INVAL | B_NOCACHE;
|
2017-06-28 17:32:09 +00:00
|
|
|
brelse(bp);
|
|
|
|
return (EIO);
|
|
|
|
}
|
2018-01-31 23:13:37 +00:00
|
|
|
if (!cg_chkmagic(cgp) || cgp->cg_cgx != cg) {
|
|
|
|
sfs = ffs_getmntstat(devvp);
|
|
|
|
printf("UFS %s%s (%s)",
|
|
|
|
devvp->v_type == VCHR ? "" : "snapshot of ",
|
|
|
|
sfs->f_mntfromname, sfs->f_mntonname);
|
|
|
|
if (!cg_chkmagic(cgp))
|
|
|
|
printf(" cg %u: bad magic number 0x%x should be 0x%x\n",
|
|
|
|
cg, cgp->cg_magic, CG_MAGIC);
|
|
|
|
else
|
|
|
|
printf(": wrong cylinder group cg %u != cgx %u\n", cg,
|
|
|
|
cgp->cg_cgx);
|
|
|
|
bp->b_flags &= ~B_CKHASH;
|
|
|
|
bp->b_flags |= B_INVAL | B_NOCACHE;
|
|
|
|
brelse(bp);
|
|
|
|
return (EIO);
|
|
|
|
}
|
2017-09-22 12:45:15 +00:00
|
|
|
bp->b_flags &= ~B_CKHASH;
|
2017-06-28 17:32:09 +00:00
|
|
|
bp->b_xflags |= BX_BKGRDWRITE;
|
Occasional cylinder-group check-hash errors were being reported on
systems running with a heavy filesystem load. Tracking down this
bug was elusive because there were actually two problems. Sometimes
the in-memory check hash was wrong and sometimes the check hash
computed when doing the read was wrong. The occurrence of either
error caused a check-hash mismatch to be reported.
The first error was that the check hash in the in-memory cylinder
group was incorrect. This error was caused by the following
sequence of events:
- We read a cylinder-group buffer and the check hash is valid.
- We update its cg_time and cg_old_time which makes the in-memory
check-hash value invalid but we do not mark the cylinder group dirty.
- We do not make any other changes to the cylinder group, so we
never mark it dirty, thus do not write it out, and hence never
update the incorrect check hash for the in-memory buffer.
- Later, the buffer gets freed, but the page with the old incorrect
check hash is still in the VM cache.
- Later, we read the cylinder group again, and the first page with
the old check hash is still in the VM cache, but some other pages
are not, so we have to do a read.
- The read does not actually get the first page from disk, but rather
from the VM cache, resulting in the old check hash in the buffer.
- The value computed after doing the read does not match causing the
error to be printed.
The fix for this problem is to only set cg_time and cg_old_time as
the cylinder group is being written to disk. This keeps the in-memory
check-hash valid unless the cylinder group has had other modifications
which will require it to be written with a new check hash calculated.
It also requires that the check hash be recalculated in the in-memory
cylinder group when it is marked clean after doing a background write.
The second problem was that the check hash computed at the end of the
read was incorrect because the calculation of the check hash on
completion of the read was being done too soon.
- When a read completes we had the following sequence:
- bufdone()
-- b_ckhashcalc (calculates check hash)
-- bufdone_finish()
--- vfs_vmio_iodone() (replaces bogus pages with the cached ones)
- When we are reading a buffer where one or more pages are already
in memory (but not all pages, or we wouldn't be doing the read),
the I/O is done with bogus_page mapped in for the pages that exist
in the VM cache. This mapping is done to avoid corrupting the
cached pages if there is any I/O overrun. The vfs_vmio_iodone()
function is responsible for replacing the bogus_page(s) with the
cached ones. But we were calculating the check hash before the
bogus_page(s) were replaced. Hence, when we were calculating the
check hash, we were partly reading from bogus_page, which means
we calculated a bad check hash (e.g., because multiple pages have
been mapped to bogus_page, so its contents are indeterminate).
The second fix is to move the check-hash calculation from bufdone()
to bufdone_finish() after the call to vfs_vmio_iodone() so that it
computes the check hash over the correct set of pages.
With these two changes, the occasional cylinder-group check-hash
errors are gone.
Submitted by: David Pfitzner <dpfitzner@netflix.com>
Reviewed by: kib
Tested by: David Pfitzner
2018-02-06 00:19:46 +00:00
|
|
|
/*
|
|
|
|
* If we are using check hashes on the cylinder group then we want
|
|
|
|
* to limit changing the cylinder group time to when we are actually
|
|
|
|
* going to write it to disk so that its check hash remains correct
|
|
|
|
* in memory. If the CK_CYLGRP flag is set the time is updated in
|
|
|
|
* ffs_bufwrite() as the buffer is queued for writing. Otherwise we
|
|
|
|
* update the time here as we have done historically.
|
|
|
|
*/
|
2017-09-22 12:45:15 +00:00
|
|
|
if ((fs->fs_metackhash & CK_CYLGRP) != 0)
|
|
|
|
bp->b_xflags |= BX_CYLGRP;
|
Occasional cylinder-group check-hash errors were being reported on
systems running with a heavy filesystem load. Tracking down this
bug was elusive because there were actually two problems. Sometimes
the in-memory check hash was wrong and sometimes the check hash
computed when doing the read was wrong. The occurrence of either
error caused a check-hash mismatch to be reported.
The first error was that the check hash in the in-memory cylinder
group was incorrect. This error was caused by the following
sequence of events:
- We read a cylinder-group buffer and the check hash is valid.
- We update its cg_time and cg_old_time which makes the in-memory
check-hash value invalid but we do not mark the cylinder group dirty.
- We do not make any other changes to the cylinder group, so we
never mark it dirty, thus do not write it out, and hence never
update the incorrect check hash for the in-memory buffer.
- Later, the buffer gets freed, but the page with the old incorrect
check hash is still in the VM cache.
- Later, we read the cylinder group again, and the first page with
the old check hash is still in the VM cache, but some other pages
are not, so we have to do a read.
- The read does not actually get the first page from disk, but rather
from the VM cache, resulting in the old check hash in the buffer.
- The value computed after doing the read does not match causing the
error to be printed.
The fix for this problem is to only set cg_time and cg_old_time as
the cylinder group is being written to disk. This keeps the in-memory
check-hash valid unless the cylinder group has had other modifications
which will require it to be written with a new check hash calculated.
It also requires that the check hash be recalculated in the in-memory
cylinder group when it is marked clean after doing a background write.
The second problem was that the check hash computed at the end of the
read was incorrect because the calculation of the check hash on
completion of the read was being done too soon.
- When a read completes we had the following sequence:
- bufdone()
-- b_ckhashcalc (calculates check hash)
-- bufdone_finish()
--- vfs_vmio_iodone() (replaces bogus pages with the cached ones)
- When we are reading a buffer where one or more pages are already
in memory (but not all pages, or we wouldn't be doing the read),
the I/O is done with bogus_page mapped in for the pages that exist
in the VM cache. This mapping is done to avoid corrupting the
cached pages if there is any I/O overrun. The vfs_vmio_iodone()
function is responsible for replacing the bogus_page(s) with the
cached ones. But we were calculating the check hash before the
bogus_page(s) were replaced. Hence, when we were calculating the
check hash, we were partly reading from bogus_page, which means
we calculated a bad check hash (e.g., because multiple pages have
been mapped to bogus_page, so its contents are indeterminate).
The second fix is to move the check-hash calculation from bufdone()
to bufdone_finish() after the call to vfs_vmio_iodone() so that it
computes the check hash over the correct set of pages.
With these two changes, the occasional cylinder-group check-hash
errors are gone.
Submitted by: David Pfitzner <dpfitzner@netflix.com>
Reviewed by: kib
Tested by: David Pfitzner
2018-02-06 00:19:46 +00:00
|
|
|
else
|
|
|
|
cgp->cg_old_time = cgp->cg_time = time_second;
|
2017-06-28 17:32:09 +00:00
|
|
|
*bpp = bp;
|
|
|
|
*cgpp = cgp;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
2017-09-22 12:45:15 +00:00
|
|
|
static void
|
|
|
|
ffs_ckhash_cg(bp)
|
|
|
|
struct buf *bp;
|
|
|
|
{
|
|
|
|
uint32_t ckhash;
|
|
|
|
struct cg *cgp;
|
|
|
|
|
|
|
|
cgp = (struct cg *)bp->b_data;
|
|
|
|
ckhash = cgp->cg_ckhash;
|
|
|
|
cgp->cg_ckhash = 0;
|
|
|
|
bp->b_ckhash = calculate_crc32c(~0L, bp->b_data, bp->b_bcount);
|
|
|
|
cgp->cg_ckhash = ckhash;
|
|
|
|
}
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2002-05-16 21:28:32 +00:00
|
|
|
* Fserr prints the name of a filesystem with an error diagnostic.
|
1995-05-30 08:16:23 +00:00
|
|
|
*
|
1994-05-24 10:09:53 +00:00
|
|
|
* The form of the error message is:
|
|
|
|
* fs: error message
|
|
|
|
*/
|
2011-06-15 18:05:08 +00:00
|
|
|
void
|
2002-02-02 01:42:44 +00:00
|
|
|
ffs_fserr(fs, inum, cp)
|
1994-05-24 10:09:53 +00:00
|
|
|
struct fs *fs;
|
2002-02-02 01:42:44 +00:00
|
|
|
ino_t inum;
|
1994-05-24 10:09:53 +00:00
|
|
|
char *cp;
|
|
|
|
{
|
2003-03-20 21:15:54 +00:00
|
|
|
struct thread *td = curthread; /* XXX */
|
|
|
|
struct proc *p = td->td_proc;
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2012-09-27 23:30:49 +00:00
|
|
|
log(LOG_ERR, "pid %d (%s), uid %d inumber %ju on %s: %s\n",
|
|
|
|
p->p_pid, p->p_comm, td->td_ucred->cr_uid, (uintmax_t)inum,
|
|
|
|
fs->fs_fsmnt, cp);
|
1994-05-24 10:09:53 +00:00
|
|
|
}
|
2001-03-21 04:09:01 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* This function provides the capability for the fsck program to
|
2010-01-11 20:44:05 +00:00
|
|
|
* update an active filesystem. Fourteen operations are provided:
|
2001-03-21 04:09:01 +00:00
|
|
|
*
|
|
|
|
* adjrefcnt(inode, amt) - adjusts the reference count on the
|
|
|
|
* specified inode by the specified amount. Under normal
|
|
|
|
* operation the count should always go down. Decrementing
|
|
|
|
* the count to zero will cause the inode to be freed.
|
2011-06-05 22:36:30 +00:00
|
|
|
* adjblkcnt(inode, amt) - adjust the number of blocks used by the
|
|
|
|
* inode by the specified amount.
|
The recomputation of file system summary at mount time can be a
very slow process, especially for large file systems that is just
recovered from a crash.
Since the summary is already re-sync'ed every 30 second, we will
not lag behind too much after a crash. With this consideration
in mind, it is more reasonable to transfer the responsibility to
background fsck, to reduce the delay after a crash.
Add a new sysctl variable, vfs.ffs.compute_summary_at_mount, to
control this behavior. When set to nonzero, we will get the
"old" behavior, that the summary is computed immediately at mount
time.
Add five new sysctl variables to adjust ndir, nbfree, nifree,
nffree and numclusters respectively. Teach fsck_ffs about these
API, however, intentionally not to check the existence, since
kernels without these sysctls must have recomputed the summary
and hence no adjustments are necessary.
This change has eliminated the usual tens of minutes of delay of
mounting large dirty volumes.
Reviewed by: mckusick
MFC After: 1 week
2005-02-20 08:02:15 +00:00
|
|
|
* adjndir, adjnbfree, adjnifree, adjnffree, adjnumclusters(amt) -
|
|
|
|
* adjust the superblock summary.
|
2001-03-21 04:09:01 +00:00
|
|
|
* freedirs(inode, count) - directory inodes [inode..inode + count - 1]
|
|
|
|
* are marked as free. Inodes should never have to be marked
|
|
|
|
* as in use.
|
|
|
|
* freefiles(inode, count) - file inodes [inode..inode + count - 1]
|
|
|
|
* are marked as free. Inodes should never have to be marked
|
|
|
|
* as in use.
|
|
|
|
* freeblks(blockno, size) - blocks [blockno..blockno + size - 1]
|
|
|
|
* are marked as free. Blocks should never have to be marked
|
|
|
|
* as in use.
|
|
|
|
* setflags(flags, set/clear) - the fs_flags field has the specified
|
|
|
|
* flags set (second parameter +1) or cleared (second parameter -1).
|
2010-01-11 20:44:05 +00:00
|
|
|
* setcwd(dirinode) - set the current directory to dirinode in the
|
|
|
|
* filesystem associated with the snapshot.
|
|
|
|
* setdotdot(oldvalue, newvalue) - Verify that the inode number for ".."
|
|
|
|
* in the current directory is oldvalue then change it to newvalue.
|
|
|
|
* unlink(nameptr, oldvalue) - Verify that the inode number associated
|
|
|
|
* with nameptr in the current directory is oldvalue then unlink it.
|
2011-07-15 16:20:33 +00:00
|
|
|
*
|
|
|
|
* The following functions may only be used on a quiescent filesystem
|
|
|
|
* by the soft updates journal. They are not safe to be run on an active
|
|
|
|
* filesystem.
|
|
|
|
*
|
|
|
|
* setinode(inode, dip) - the specified disk inode is replaced with the
|
|
|
|
* contents pointed to by dip.
|
|
|
|
* setbufoutput(fd, flags) - output associated with the specified file
|
|
|
|
* descriptor (which must reference the character device supporting
|
|
|
|
* the filesystem) switches from using physio to running through the
|
|
|
|
* buffer cache when flags is set to 1. The descriptor reverts to
|
|
|
|
* physio for output when flags is set to zero.
|
2001-03-21 04:09:01 +00:00
|
|
|
*/
|
|
|
|
|
2002-03-19 22:40:48 +00:00
|
|
|
static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS);
|
2001-03-21 04:09:01 +00:00
|
|
|
|
|
|
|
SYSCTL_PROC(_vfs_ffs, FFS_ADJ_REFCNT, adjrefcnt, CTLFLAG_WR|CTLTYPE_STRUCT,
|
|
|
|
0, 0, sysctl_ffs_fsck, "S,fsck", "Adjust Inode Reference Count");
|
|
|
|
|
2005-02-10 12:20:08 +00:00
|
|
|
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_BLKCNT, adjblkcnt, CTLFLAG_WR,
|
2001-03-21 04:09:01 +00:00
|
|
|
sysctl_ffs_fsck, "Adjust Inode Used Blocks Count");
|
|
|
|
|
The recomputation of file system summary at mount time can be a
very slow process, especially for large file systems that is just
recovered from a crash.
Since the summary is already re-sync'ed every 30 second, we will
not lag behind too much after a crash. With this consideration
in mind, it is more reasonable to transfer the responsibility to
background fsck, to reduce the delay after a crash.
Add a new sysctl variable, vfs.ffs.compute_summary_at_mount, to
control this behavior. When set to nonzero, we will get the
"old" behavior, that the summary is computed immediately at mount
time.
Add five new sysctl variables to adjust ndir, nbfree, nifree,
nffree and numclusters respectively. Teach fsck_ffs about these
API, however, intentionally not to check the existence, since
kernels without these sysctls must have recomputed the summary
and hence no adjustments are necessary.
This change has eliminated the usual tens of minutes of delay of
mounting large dirty volumes.
Reviewed by: mckusick
MFC After: 1 week
2005-02-20 08:02:15 +00:00
|
|
|
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NDIR, adjndir, CTLFLAG_WR,
|
|
|
|
sysctl_ffs_fsck, "Adjust number of directories");
|
|
|
|
|
|
|
|
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NBFREE, adjnbfree, CTLFLAG_WR,
|
|
|
|
sysctl_ffs_fsck, "Adjust number of free blocks");
|
|
|
|
|
|
|
|
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NIFREE, adjnifree, CTLFLAG_WR,
|
|
|
|
sysctl_ffs_fsck, "Adjust number of free inodes");
|
|
|
|
|
|
|
|
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NFFREE, adjnffree, CTLFLAG_WR,
|
|
|
|
sysctl_ffs_fsck, "Adjust number of free frags");
|
|
|
|
|
|
|
|
static SYSCTL_NODE(_vfs_ffs, FFS_ADJ_NUMCLUSTERS, adjnumclusters, CTLFLAG_WR,
|
|
|
|
sysctl_ffs_fsck, "Adjust number of free clusters");
|
|
|
|
|
2005-02-10 12:20:08 +00:00
|
|
|
static SYSCTL_NODE(_vfs_ffs, FFS_DIR_FREE, freedirs, CTLFLAG_WR,
|
2001-03-21 04:09:01 +00:00
|
|
|
sysctl_ffs_fsck, "Free Range of Directory Inodes");
|
|
|
|
|
2005-02-10 12:20:08 +00:00
|
|
|
static SYSCTL_NODE(_vfs_ffs, FFS_FILE_FREE, freefiles, CTLFLAG_WR,
|
2001-03-21 04:09:01 +00:00
|
|
|
sysctl_ffs_fsck, "Free Range of File Inodes");
|
|
|
|
|
2005-02-10 12:20:08 +00:00
|
|
|
static SYSCTL_NODE(_vfs_ffs, FFS_BLK_FREE, freeblks, CTLFLAG_WR,
|
2001-03-21 04:09:01 +00:00
|
|
|
sysctl_ffs_fsck, "Free Range of Blocks");
|
|
|
|
|
2005-02-10 12:20:08 +00:00
|
|
|
static SYSCTL_NODE(_vfs_ffs, FFS_SET_FLAGS, setflags, CTLFLAG_WR,
|
2001-03-21 04:09:01 +00:00
|
|
|
sysctl_ffs_fsck, "Change Filesystem Flags");
|
|
|
|
|
2010-01-11 20:44:05 +00:00
|
|
|
static SYSCTL_NODE(_vfs_ffs, FFS_SET_CWD, setcwd, CTLFLAG_WR,
|
|
|
|
sysctl_ffs_fsck, "Set Current Working Directory");
|
|
|
|
|
|
|
|
static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOTDOT, setdotdot, CTLFLAG_WR,
|
|
|
|
sysctl_ffs_fsck, "Change Value of .. Entry");
|
|
|
|
|
|
|
|
static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, CTLFLAG_WR,
|
|
|
|
sysctl_ffs_fsck, "Unlink a Duplicate Name");
|
|
|
|
|
2011-07-15 16:20:33 +00:00
|
|
|
static SYSCTL_NODE(_vfs_ffs, FFS_SET_INODE, setinode, CTLFLAG_WR,
|
|
|
|
sysctl_ffs_fsck, "Update an On-Disk Inode");
|
|
|
|
|
|
|
|
static SYSCTL_NODE(_vfs_ffs, FFS_SET_BUFOUTPUT, setbufoutput, CTLFLAG_WR,
|
|
|
|
sysctl_ffs_fsck, "Set Buffered Writing for Descriptor");
|
|
|
|
|
|
|
|
#define DEBUG 1
|
2001-03-21 04:09:01 +00:00
|
|
|
#ifdef DEBUG
|
2011-07-22 18:03:33 +00:00
|
|
|
static int fsckcmds = 0;
|
2001-03-21 04:09:01 +00:00
|
|
|
SYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, "");
|
|
|
|
#endif /* DEBUG */
|
|
|
|
|
2011-07-15 16:20:33 +00:00
|
|
|
static int buffered_write(struct file *, struct uio *, struct ucred *,
|
|
|
|
int, struct thread *);
|
|
|
|
|
2001-03-21 04:09:01 +00:00
|
|
|
static int
|
|
|
|
sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS)
|
|
|
|
{
|
2010-01-11 20:44:05 +00:00
|
|
|
struct thread *td = curthread;
|
2001-03-21 04:09:01 +00:00
|
|
|
struct fsck_cmd cmd;
|
|
|
|
struct ufsmount *ump;
|
2015-07-11 16:19:11 +00:00
|
|
|
struct vnode *vp, *dvp, *fdvp;
|
2010-01-11 20:44:05 +00:00
|
|
|
struct inode *ip, *dp;
|
2001-03-21 04:09:01 +00:00
|
|
|
struct mount *mp;
|
|
|
|
struct fs *fs;
|
2002-06-21 06:18:05 +00:00
|
|
|
ufs2_daddr_t blkno;
|
2001-03-21 04:09:01 +00:00
|
|
|
long blkcnt, blksize;
|
2011-07-15 16:20:33 +00:00
|
|
|
struct file *fp, *vfp;
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_t rights;
|
2012-10-22 17:50:54 +00:00
|
|
|
int filetype, error;
|
2011-07-15 16:20:33 +00:00
|
|
|
static struct fileops *origops, bufferedops;
|
2001-03-21 04:09:01 +00:00
|
|
|
|
|
|
|
if (req->newlen > sizeof cmd)
|
|
|
|
return (EBADRPC);
|
|
|
|
if ((error = SYSCTL_IN(req, &cmd, sizeof cmd)) != 0)
|
|
|
|
return (error);
|
|
|
|
if (cmd.version != FFS_CMD_VERSION)
|
|
|
|
return (ERPCMISMATCH);
|
2015-06-16 13:09:18 +00:00
|
|
|
if ((error = getvnode(td, cmd.handle,
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_init(&rights, CAP_FSCK), &fp)) != 0)
|
2001-03-21 04:09:01 +00:00
|
|
|
return (error);
|
2010-01-11 20:44:05 +00:00
|
|
|
vp = fp->f_data;
|
|
|
|
if (vp->v_type != VREG && vp->v_type != VDIR) {
|
|
|
|
fdrop(fp, td);
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
|
|
|
vn_start_write(vp, &mp, V_WAIT);
|
2016-04-10 21:48:11 +00:00
|
|
|
if (mp == NULL ||
|
|
|
|
strncmp(mp->mnt_stat.f_fstypename, "ufs", MFSNAMELEN)) {
|
2001-04-17 05:06:37 +00:00
|
|
|
vn_finished_write(mp);
|
2010-01-11 20:44:05 +00:00
|
|
|
fdrop(fp, td);
|
2001-03-23 20:58:25 +00:00
|
|
|
return (EINVAL);
|
2001-04-17 05:06:37 +00:00
|
|
|
}
|
2011-07-15 16:20:33 +00:00
|
|
|
ump = VFSTOUFS(mp);
|
|
|
|
if ((mp->mnt_flag & MNT_RDONLY) &&
|
|
|
|
ump->um_fsckpid != td->td_proc->p_pid) {
|
2001-04-17 05:06:37 +00:00
|
|
|
vn_finished_write(mp);
|
2010-01-11 20:44:05 +00:00
|
|
|
fdrop(fp, td);
|
2001-03-21 04:09:01 +00:00
|
|
|
return (EROFS);
|
2001-04-17 05:06:37 +00:00
|
|
|
}
|
2001-03-21 04:09:01 +00:00
|
|
|
fs = ump->um_fs;
|
2018-03-17 12:59:55 +00:00
|
|
|
filetype = IFREG;
|
2001-03-21 04:09:01 +00:00
|
|
|
|
|
|
|
switch (oidp->oid_number) {
|
|
|
|
|
|
|
|
case FFS_SET_FLAGS:
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (fsckcmds)
|
|
|
|
printf("%s: %s flags\n", mp->mnt_stat.f_mntonname,
|
|
|
|
cmd.size > 0 ? "set" : "clear");
|
|
|
|
#endif /* DEBUG */
|
|
|
|
if (cmd.size > 0)
|
|
|
|
fs->fs_flags |= (long)cmd.value;
|
|
|
|
else
|
|
|
|
fs->fs_flags &= ~(long)cmd.value;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case FFS_ADJ_REFCNT:
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (fsckcmds) {
|
2011-07-15 16:20:33 +00:00
|
|
|
printf("%s: adjust inode %jd link count by %jd\n",
|
2002-07-08 12:42:29 +00:00
|
|
|
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
|
|
|
|
(intmax_t)cmd.size);
|
2001-03-21 04:09:01 +00:00
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
2005-02-08 17:40:01 +00:00
|
|
|
if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
|
2001-04-17 05:06:37 +00:00
|
|
|
break;
|
2001-03-21 04:09:01 +00:00
|
|
|
ip = VTOI(vp);
|
|
|
|
ip->i_nlink += cmd.size;
|
2004-07-28 06:41:27 +00:00
|
|
|
DIP_SET(ip, i_nlink, ip->i_nlink);
|
2001-03-21 04:09:01 +00:00
|
|
|
ip->i_effnlink += cmd.size;
|
2011-07-15 16:20:33 +00:00
|
|
|
ip->i_flag |= IN_CHANGE | IN_MODIFIED;
|
|
|
|
error = ffs_update(vp, 1);
|
2001-03-21 04:09:01 +00:00
|
|
|
if (DOINGSOFTDEP(vp))
|
|
|
|
softdep_change_linkcnt(ip);
|
|
|
|
vput(vp);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case FFS_ADJ_BLKCNT:
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (fsckcmds) {
|
2002-07-08 12:42:29 +00:00
|
|
|
printf("%s: adjust inode %jd block count by %jd\n",
|
|
|
|
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
|
|
|
|
(intmax_t)cmd.size);
|
2001-03-21 04:09:01 +00:00
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
2005-02-08 17:40:01 +00:00
|
|
|
if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
|
2001-04-17 05:06:37 +00:00
|
|
|
break;
|
2001-03-21 04:09:01 +00:00
|
|
|
ip = VTOI(vp);
|
2004-07-28 06:41:27 +00:00
|
|
|
DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size);
|
2011-07-15 16:20:33 +00:00
|
|
|
ip->i_flag |= IN_CHANGE | IN_MODIFIED;
|
|
|
|
error = ffs_update(vp, 1);
|
2001-03-21 04:09:01 +00:00
|
|
|
vput(vp);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case FFS_DIR_FREE:
|
2018-03-17 12:59:55 +00:00
|
|
|
filetype = IFDIR;
|
2001-03-21 04:09:01 +00:00
|
|
|
/* fall through */
|
|
|
|
|
|
|
|
case FFS_FILE_FREE:
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (fsckcmds) {
|
|
|
|
if (cmd.size == 1)
|
2012-09-27 23:30:49 +00:00
|
|
|
printf("%s: free %s inode %ju\n",
|
2001-03-21 04:09:01 +00:00
|
|
|
mp->mnt_stat.f_mntonname,
|
2018-03-17 12:59:55 +00:00
|
|
|
filetype == IFDIR ? "directory" : "file",
|
2012-09-27 23:30:49 +00:00
|
|
|
(uintmax_t)cmd.value);
|
2001-03-21 04:09:01 +00:00
|
|
|
else
|
2012-09-27 23:30:49 +00:00
|
|
|
printf("%s: free %s inodes %ju-%ju\n",
|
2001-03-21 04:09:01 +00:00
|
|
|
mp->mnt_stat.f_mntonname,
|
2018-03-17 12:59:55 +00:00
|
|
|
filetype == IFDIR ? "directory" : "file",
|
2012-09-27 23:30:49 +00:00
|
|
|
(uintmax_t)cmd.value,
|
|
|
|
(uintmax_t)(cmd.value + cmd.size - 1));
|
2001-03-21 04:09:01 +00:00
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
|
|
|
while (cmd.size > 0) {
|
2005-01-24 10:08:35 +00:00
|
|
|
if ((error = ffs_freefile(ump, fs, ump->um_devvp,
|
2010-04-24 07:05:35 +00:00
|
|
|
cmd.value, filetype, NULL)))
|
2001-04-17 05:06:37 +00:00
|
|
|
break;
|
2001-03-21 04:09:01 +00:00
|
|
|
cmd.size -= 1;
|
|
|
|
cmd.value += 1;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
|
|
|
case FFS_BLK_FREE:
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (fsckcmds) {
|
|
|
|
if (cmd.size == 1)
|
2002-09-19 03:55:30 +00:00
|
|
|
printf("%s: free block %jd\n",
|
2001-03-21 04:09:01 +00:00
|
|
|
mp->mnt_stat.f_mntonname,
|
2002-06-21 06:18:05 +00:00
|
|
|
(intmax_t)cmd.value);
|
2001-03-21 04:09:01 +00:00
|
|
|
else
|
2002-09-19 03:55:30 +00:00
|
|
|
printf("%s: free blocks %jd-%jd\n",
|
2001-03-21 04:09:01 +00:00
|
|
|
mp->mnt_stat.f_mntonname,
|
2002-06-21 06:18:05 +00:00
|
|
|
(intmax_t)cmd.value,
|
|
|
|
(intmax_t)cmd.value + cmd.size - 1);
|
2001-03-21 04:09:01 +00:00
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
2002-06-21 06:18:05 +00:00
|
|
|
blkno = cmd.value;
|
2001-03-21 04:09:01 +00:00
|
|
|
blkcnt = cmd.size;
|
|
|
|
blksize = fs->fs_frag - (blkno % fs->fs_frag);
|
|
|
|
while (blkcnt > 0) {
|
|
|
|
if (blksize > blkcnt)
|
|
|
|
blksize = blkcnt;
|
2005-01-24 10:08:35 +00:00
|
|
|
ffs_blkfree(ump, fs, ump->um_devvp, blkno,
|
2017-02-15 19:50:26 +00:00
|
|
|
blksize * fs->fs_fsize, UFS_ROOTINO, VDIR, NULL);
|
2001-03-21 04:09:01 +00:00
|
|
|
blkno += blksize;
|
|
|
|
blkcnt -= blksize;
|
|
|
|
blksize = fs->fs_frag;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
|
The recomputation of file system summary at mount time can be a
very slow process, especially for large file systems that is just
recovered from a crash.
Since the summary is already re-sync'ed every 30 second, we will
not lag behind too much after a crash. With this consideration
in mind, it is more reasonable to transfer the responsibility to
background fsck, to reduce the delay after a crash.
Add a new sysctl variable, vfs.ffs.compute_summary_at_mount, to
control this behavior. When set to nonzero, we will get the
"old" behavior, that the summary is computed immediately at mount
time.
Add five new sysctl variables to adjust ndir, nbfree, nifree,
nffree and numclusters respectively. Teach fsck_ffs about these
API, however, intentionally not to check the existence, since
kernels without these sysctls must have recomputed the summary
and hence no adjustments are necessary.
This change has eliminated the usual tens of minutes of delay of
mounting large dirty volumes.
Reviewed by: mckusick
MFC After: 1 week
2005-02-20 08:02:15 +00:00
|
|
|
/*
|
|
|
|
* Adjust superblock summaries. fsck(8) is expected to
|
|
|
|
* submit deltas when necessary.
|
|
|
|
*/
|
|
|
|
case FFS_ADJ_NDIR:
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (fsckcmds) {
|
|
|
|
printf("%s: adjust number of directories by %jd\n",
|
|
|
|
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
|
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
|
|
|
fs->fs_cstotal.cs_ndir += cmd.value;
|
|
|
|
break;
|
2010-01-11 20:44:05 +00:00
|
|
|
|
The recomputation of file system summary at mount time can be a
very slow process, especially for large file systems that is just
recovered from a crash.
Since the summary is already re-sync'ed every 30 second, we will
not lag behind too much after a crash. With this consideration
in mind, it is more reasonable to transfer the responsibility to
background fsck, to reduce the delay after a crash.
Add a new sysctl variable, vfs.ffs.compute_summary_at_mount, to
control this behavior. When set to nonzero, we will get the
"old" behavior, that the summary is computed immediately at mount
time.
Add five new sysctl variables to adjust ndir, nbfree, nifree,
nffree and numclusters respectively. Teach fsck_ffs about these
API, however, intentionally not to check the existence, since
kernels without these sysctls must have recomputed the summary
and hence no adjustments are necessary.
This change has eliminated the usual tens of minutes of delay of
mounting large dirty volumes.
Reviewed by: mckusick
MFC After: 1 week
2005-02-20 08:02:15 +00:00
|
|
|
case FFS_ADJ_NBFREE:
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (fsckcmds) {
|
|
|
|
printf("%s: adjust number of free blocks by %+jd\n",
|
|
|
|
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
|
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
|
|
|
fs->fs_cstotal.cs_nbfree += cmd.value;
|
|
|
|
break;
|
2010-01-11 20:44:05 +00:00
|
|
|
|
The recomputation of file system summary at mount time can be a
very slow process, especially for large file systems that is just
recovered from a crash.
Since the summary is already re-sync'ed every 30 second, we will
not lag behind too much after a crash. With this consideration
in mind, it is more reasonable to transfer the responsibility to
background fsck, to reduce the delay after a crash.
Add a new sysctl variable, vfs.ffs.compute_summary_at_mount, to
control this behavior. When set to nonzero, we will get the
"old" behavior, that the summary is computed immediately at mount
time.
Add five new sysctl variables to adjust ndir, nbfree, nifree,
nffree and numclusters respectively. Teach fsck_ffs about these
API, however, intentionally not to check the existence, since
kernels without these sysctls must have recomputed the summary
and hence no adjustments are necessary.
This change has eliminated the usual tens of minutes of delay of
mounting large dirty volumes.
Reviewed by: mckusick
MFC After: 1 week
2005-02-20 08:02:15 +00:00
|
|
|
case FFS_ADJ_NIFREE:
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (fsckcmds) {
|
|
|
|
printf("%s: adjust number of free inodes by %+jd\n",
|
|
|
|
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
|
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
|
|
|
fs->fs_cstotal.cs_nifree += cmd.value;
|
|
|
|
break;
|
2010-01-11 20:44:05 +00:00
|
|
|
|
The recomputation of file system summary at mount time can be a
very slow process, especially for large file systems that is just
recovered from a crash.
Since the summary is already re-sync'ed every 30 second, we will
not lag behind too much after a crash. With this consideration
in mind, it is more reasonable to transfer the responsibility to
background fsck, to reduce the delay after a crash.
Add a new sysctl variable, vfs.ffs.compute_summary_at_mount, to
control this behavior. When set to nonzero, we will get the
"old" behavior, that the summary is computed immediately at mount
time.
Add five new sysctl variables to adjust ndir, nbfree, nifree,
nffree and numclusters respectively. Teach fsck_ffs about these
API, however, intentionally not to check the existence, since
kernels without these sysctls must have recomputed the summary
and hence no adjustments are necessary.
This change has eliminated the usual tens of minutes of delay of
mounting large dirty volumes.
Reviewed by: mckusick
MFC After: 1 week
2005-02-20 08:02:15 +00:00
|
|
|
case FFS_ADJ_NFFREE:
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (fsckcmds) {
|
|
|
|
printf("%s: adjust number of free frags by %+jd\n",
|
|
|
|
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
|
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
|
|
|
fs->fs_cstotal.cs_nffree += cmd.value;
|
|
|
|
break;
|
2010-01-11 20:44:05 +00:00
|
|
|
|
The recomputation of file system summary at mount time can be a
very slow process, especially for large file systems that is just
recovered from a crash.
Since the summary is already re-sync'ed every 30 second, we will
not lag behind too much after a crash. With this consideration
in mind, it is more reasonable to transfer the responsibility to
background fsck, to reduce the delay after a crash.
Add a new sysctl variable, vfs.ffs.compute_summary_at_mount, to
control this behavior. When set to nonzero, we will get the
"old" behavior, that the summary is computed immediately at mount
time.
Add five new sysctl variables to adjust ndir, nbfree, nifree,
nffree and numclusters respectively. Teach fsck_ffs about these
API, however, intentionally not to check the existence, since
kernels without these sysctls must have recomputed the summary
and hence no adjustments are necessary.
This change has eliminated the usual tens of minutes of delay of
mounting large dirty volumes.
Reviewed by: mckusick
MFC After: 1 week
2005-02-20 08:02:15 +00:00
|
|
|
case FFS_ADJ_NUMCLUSTERS:
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (fsckcmds) {
|
|
|
|
printf("%s: adjust number of free clusters by %+jd\n",
|
|
|
|
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
|
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
|
|
|
fs->fs_cstotal.cs_numclusters += cmd.value;
|
|
|
|
break;
|
|
|
|
|
2010-01-11 20:44:05 +00:00
|
|
|
case FFS_SET_CWD:
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (fsckcmds) {
|
|
|
|
printf("%s: set current directory to inode %jd\n",
|
|
|
|
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
|
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
|
|
|
if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_SHARED, &vp)))
|
|
|
|
break;
|
|
|
|
AUDIT_ARG_VNODE1(vp);
|
|
|
|
if ((error = change_dir(vp, td)) != 0) {
|
|
|
|
vput(vp);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
VOP_UNLOCK(vp, 0);
|
2015-07-11 16:19:11 +00:00
|
|
|
pwd_chdir(td, vp);
|
2010-01-11 20:44:05 +00:00
|
|
|
break;
|
|
|
|
|
|
|
|
case FFS_SET_DOTDOT:
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (fsckcmds) {
|
|
|
|
printf("%s: change .. in cwd from %jd to %jd\n",
|
|
|
|
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value,
|
|
|
|
(intmax_t)cmd.size);
|
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
|
|
|
/*
|
|
|
|
* First we have to get and lock the parent directory
|
|
|
|
* to which ".." points.
|
|
|
|
*/
|
|
|
|
error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &fdvp);
|
|
|
|
if (error)
|
|
|
|
break;
|
|
|
|
/*
|
|
|
|
* Now we get and lock the child directory containing "..".
|
|
|
|
*/
|
|
|
|
FILEDESC_SLOCK(td->td_proc->p_fd);
|
|
|
|
dvp = td->td_proc->p_fd->fd_cdir;
|
|
|
|
FILEDESC_SUNLOCK(td->td_proc->p_fd);
|
|
|
|
if ((error = vget(dvp, LK_EXCLUSIVE, td)) != 0) {
|
|
|
|
vput(fdvp);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
dp = VTOI(dvp);
|
|
|
|
dp->i_offset = 12; /* XXX mastertemplate.dot_reclen */
|
|
|
|
error = ufs_dirrewrite(dp, VTOI(fdvp), (ino_t)cmd.size,
|
|
|
|
DT_DIR, 0);
|
|
|
|
cache_purge(fdvp);
|
|
|
|
cache_purge(dvp);
|
|
|
|
vput(dvp);
|
|
|
|
vput(fdvp);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case FFS_UNLINK:
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (fsckcmds) {
|
|
|
|
char buf[32];
|
|
|
|
|
2010-01-11 22:42:06 +00:00
|
|
|
if (copyinstr((char *)(intptr_t)cmd.value, buf,32,NULL))
|
2010-01-11 20:44:05 +00:00
|
|
|
strncpy(buf, "Name_too_long", 32);
|
|
|
|
printf("%s: unlink %s (inode %jd)\n",
|
|
|
|
mp->mnt_stat.f_mntonname, buf, (intmax_t)cmd.size);
|
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
|
|
|
/*
|
|
|
|
* kern_unlinkat will do its own start/finish writes and
|
|
|
|
* they do not nest, so drop ours here. Setting mp == NULL
|
|
|
|
* indicates that vn_finished_write is not needed down below.
|
|
|
|
*/
|
|
|
|
vn_finished_write(mp);
|
|
|
|
mp = NULL;
|
2010-01-11 22:42:06 +00:00
|
|
|
error = kern_unlinkat(td, AT_FDCWD, (char *)(intptr_t)cmd.value,
|
2010-01-11 20:44:05 +00:00
|
|
|
UIO_USERSPACE, (ino_t)cmd.size);
|
|
|
|
break;
|
|
|
|
|
2011-07-15 16:20:33 +00:00
|
|
|
case FFS_SET_INODE:
|
|
|
|
if (ump->um_fsckpid != td->td_proc->p_pid) {
|
|
|
|
error = EPERM;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
#ifdef DEBUG
|
2011-07-22 18:03:33 +00:00
|
|
|
if (fsckcmds) {
|
2011-07-15 16:20:33 +00:00
|
|
|
printf("%s: update inode %jd\n",
|
|
|
|
mp->mnt_stat.f_mntonname, (intmax_t)cmd.value);
|
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
|
|
|
if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp)))
|
|
|
|
break;
|
|
|
|
AUDIT_ARG_VNODE1(vp);
|
|
|
|
ip = VTOI(vp);
|
2016-09-17 16:47:34 +00:00
|
|
|
if (I_IS_UFS1(ip))
|
2011-07-15 16:20:33 +00:00
|
|
|
error = copyin((void *)(intptr_t)cmd.size, ip->i_din1,
|
|
|
|
sizeof(struct ufs1_dinode));
|
|
|
|
else
|
|
|
|
error = copyin((void *)(intptr_t)cmd.size, ip->i_din2,
|
|
|
|
sizeof(struct ufs2_dinode));
|
|
|
|
if (error) {
|
|
|
|
vput(vp);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
ip->i_flag |= IN_CHANGE | IN_MODIFIED;
|
|
|
|
error = ffs_update(vp, 1);
|
|
|
|
vput(vp);
|
|
|
|
break;
|
|
|
|
|
|
|
|
case FFS_SET_BUFOUTPUT:
|
|
|
|
if (ump->um_fsckpid != td->td_proc->p_pid) {
|
|
|
|
error = EPERM;
|
|
|
|
break;
|
|
|
|
}
|
2016-09-17 16:47:34 +00:00
|
|
|
if (ITOUMP(VTOI(vp)) != ump) {
|
2011-07-15 16:20:33 +00:00
|
|
|
error = EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (fsckcmds) {
|
|
|
|
printf("%s: %s buffered output for descriptor %jd\n",
|
|
|
|
mp->mnt_stat.f_mntonname,
|
|
|
|
cmd.size == 1 ? "enable" : "disable",
|
|
|
|
(intmax_t)cmd.value);
|
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
2015-06-16 13:09:18 +00:00
|
|
|
if ((error = getvnode(td, cmd.value,
|
Change the cap_rights_t type from uint64_t to a structure that we can extend
in the future in a backward compatible (API and ABI) way.
The cap_rights_t represents capability rights. We used to use one bit to
represent one right, but we are running out of spare bits. Currently the new
structure provides place for 114 rights (so 50 more than the previous
cap_rights_t), but it is possible to grow the structure to hold at least 285
rights, although we can make it even larger if 285 rights won't be enough.
The structure definition looks like this:
struct cap_rights {
uint64_t cr_rights[CAP_RIGHTS_VERSION + 2];
};
The initial CAP_RIGHTS_VERSION is 0.
The top two bits in the first element of the cr_rights[] array contain total
number of elements in the array - 2. This means if those two bits are equal to
0, we have 2 array elements.
The top two bits in all remaining array elements should be 0.
The next five bits in all array elements contain array index. Only one bit is
used and bit position in this five-bits range defines array index. This means
there can be at most five array elements in the future.
To define new right the CAPRIGHT() macro must be used. The macro takes two
arguments - an array index and a bit to set, eg.
#define CAP_PDKILL CAPRIGHT(1, 0x0000000000000800ULL)
We still support aliases that combine few rights, but the rights have to belong
to the same array element, eg:
#define CAP_LOOKUP CAPRIGHT(0, 0x0000000000000400ULL)
#define CAP_FCHMOD CAPRIGHT(0, 0x0000000000002000ULL)
#define CAP_FCHMODAT (CAP_FCHMOD | CAP_LOOKUP)
There is new API to manage the new cap_rights_t structure:
cap_rights_t *cap_rights_init(cap_rights_t *rights, ...);
void cap_rights_set(cap_rights_t *rights, ...);
void cap_rights_clear(cap_rights_t *rights, ...);
bool cap_rights_is_set(const cap_rights_t *rights, ...);
bool cap_rights_is_valid(const cap_rights_t *rights);
void cap_rights_merge(cap_rights_t *dst, const cap_rights_t *src);
void cap_rights_remove(cap_rights_t *dst, const cap_rights_t *src);
bool cap_rights_contains(const cap_rights_t *big, const cap_rights_t *little);
Capability rights to the cap_rights_init(), cap_rights_set(),
cap_rights_clear() and cap_rights_is_set() functions are provided by
separating them with commas, eg:
cap_rights_t rights;
cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FSTAT);
There is no need to terminate the list of rights, as those functions are
actually macros that take care of the termination, eg:
#define cap_rights_set(rights, ...) \
__cap_rights_set((rights), __VA_ARGS__, 0ULL)
void __cap_rights_set(cap_rights_t *rights, ...);
Thanks to using one bit as an array index we can assert in those functions that
there are no two rights belonging to different array elements provided
together. For example this is illegal and will be detected, because CAP_LOOKUP
belongs to element 0 and CAP_PDKILL to element 1:
cap_rights_init(&rights, CAP_LOOKUP | CAP_PDKILL);
Providing several rights that belongs to the same array's element this way is
correct, but is not advised. It should only be used for aliases definition.
This commit also breaks compatibility with some existing Capsicum system calls,
but I see no other way to do that. This should be fine as Capsicum is still
experimental and this change is not going to 9.x.
Sponsored by: The FreeBSD Foundation
2013-09-05 00:09:56 +00:00
|
|
|
cap_rights_init(&rights, CAP_FSCK), &vfp)) != 0)
|
2011-07-15 16:20:33 +00:00
|
|
|
break;
|
|
|
|
if (vfp->f_vnode->v_type != VCHR) {
|
|
|
|
fdrop(vfp, td);
|
|
|
|
error = EINVAL;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (origops == NULL) {
|
|
|
|
origops = vfp->f_ops;
|
|
|
|
bcopy((void *)origops, (void *)&bufferedops,
|
|
|
|
sizeof(bufferedops));
|
|
|
|
bufferedops.fo_write = buffered_write;
|
|
|
|
}
|
|
|
|
if (cmd.size == 1)
|
|
|
|
atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops,
|
|
|
|
(uintptr_t)&bufferedops);
|
|
|
|
else
|
|
|
|
atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops,
|
|
|
|
(uintptr_t)origops);
|
|
|
|
fdrop(vfp, td);
|
|
|
|
break;
|
|
|
|
|
2001-03-21 04:09:01 +00:00
|
|
|
default:
|
|
|
|
#ifdef DEBUG
|
|
|
|
if (fsckcmds) {
|
|
|
|
printf("Invalid request %d from fsck\n",
|
|
|
|
oidp->oid_number);
|
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
2001-04-17 05:06:37 +00:00
|
|
|
error = EINVAL;
|
|
|
|
break;
|
2001-03-21 04:09:01 +00:00
|
|
|
|
|
|
|
}
|
2010-01-11 20:44:05 +00:00
|
|
|
fdrop(fp, td);
|
2001-04-17 05:06:37 +00:00
|
|
|
vn_finished_write(mp);
|
|
|
|
return (error);
|
2001-03-21 04:09:01 +00:00
|
|
|
}
|
2011-07-15 16:20:33 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Function to switch a descriptor to use the buffer cache to stage
|
|
|
|
* its I/O. This is needed so that writes to the filesystem device
|
|
|
|
* will give snapshots a chance to copy modified blocks for which it
|
|
|
|
* needs to retain copies.
|
|
|
|
*/
|
|
|
|
static int
|
|
|
|
buffered_write(fp, uio, active_cred, flags, td)
|
|
|
|
struct file *fp;
|
|
|
|
struct uio *uio;
|
|
|
|
struct ucred *active_cred;
|
|
|
|
int flags;
|
|
|
|
struct thread *td;
|
|
|
|
{
|
2013-02-10 10:17:33 +00:00
|
|
|
struct vnode *devvp, *vp;
|
2011-07-15 16:20:33 +00:00
|
|
|
struct inode *ip;
|
|
|
|
struct buf *bp;
|
|
|
|
struct fs *fs;
|
2013-02-10 10:17:33 +00:00
|
|
|
struct filedesc *fdp;
|
2012-10-22 17:50:54 +00:00
|
|
|
int error;
|
2011-07-15 16:20:33 +00:00
|
|
|
daddr_t lbn;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The devvp is associated with the /dev filesystem. To discover
|
|
|
|
* the filesystem with which the device is associated, we depend
|
|
|
|
* on the application setting the current directory to a location
|
|
|
|
* within the filesystem being written. Yes, this is an ugly hack.
|
|
|
|
*/
|
|
|
|
devvp = fp->f_vnode;
|
2013-02-10 10:17:33 +00:00
|
|
|
if (!vn_isdisk(devvp, NULL))
|
|
|
|
return (EINVAL);
|
|
|
|
fdp = td->td_proc->p_fd;
|
|
|
|
FILEDESC_SLOCK(fdp);
|
|
|
|
vp = fdp->fd_cdir;
|
|
|
|
vref(vp);
|
|
|
|
FILEDESC_SUNLOCK(fdp);
|
|
|
|
vn_lock(vp, LK_SHARED | LK_RETRY);
|
|
|
|
/*
|
|
|
|
* Check that the current directory vnode indeed belongs to
|
|
|
|
* UFS before trying to dereference UFS-specific v_data fields.
|
|
|
|
*/
|
|
|
|
if (vp->v_op != &ffs_vnodeops1 && vp->v_op != &ffs_vnodeops2) {
|
|
|
|
vput(vp);
|
2011-07-15 16:20:33 +00:00
|
|
|
return (EINVAL);
|
2013-02-10 10:17:33 +00:00
|
|
|
}
|
|
|
|
ip = VTOI(vp);
|
2016-09-17 16:47:34 +00:00
|
|
|
if (ITODEVVP(ip) != devvp) {
|
2013-02-10 10:17:33 +00:00
|
|
|
vput(vp);
|
|
|
|
return (EINVAL);
|
|
|
|
}
|
2016-09-17 16:47:34 +00:00
|
|
|
fs = ITOFS(ip);
|
2013-02-10 10:17:33 +00:00
|
|
|
vput(vp);
|
2012-07-02 21:01:03 +00:00
|
|
|
foffset_lock_uio(fp, uio, flags);
|
2011-07-15 16:20:33 +00:00
|
|
|
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
|
|
|
|
#ifdef DEBUG
|
2011-07-22 18:03:33 +00:00
|
|
|
if (fsckcmds) {
|
2011-07-15 16:20:33 +00:00
|
|
|
printf("%s: buffered write for block %jd\n",
|
|
|
|
fs->fs_fsmnt, (intmax_t)btodb(uio->uio_offset));
|
|
|
|
}
|
|
|
|
#endif /* DEBUG */
|
|
|
|
/*
|
|
|
|
* All I/O must be contained within a filesystem block, start on
|
|
|
|
* a fragment boundary, and be a multiple of fragments in length.
|
|
|
|
*/
|
|
|
|
if (uio->uio_resid > fs->fs_bsize - (uio->uio_offset % fs->fs_bsize) ||
|
|
|
|
fragoff(fs, uio->uio_offset) != 0 ||
|
|
|
|
fragoff(fs, uio->uio_resid) != 0) {
|
|
|
|
error = EINVAL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
lbn = numfrags(fs, uio->uio_offset);
|
|
|
|
bp = getblk(devvp, lbn, uio->uio_resid, 0, 0, 0);
|
|
|
|
bp->b_flags |= B_RELBUF;
|
|
|
|
if ((error = uiomove((char *)bp->b_data, uio->uio_resid, uio)) != 0) {
|
|
|
|
brelse(bp);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
error = bwrite(bp);
|
|
|
|
out:
|
|
|
|
VOP_UNLOCK(devvp, 0);
|
2012-07-02 21:01:03 +00:00
|
|
|
foffset_unlock_uio(fp, uio, flags | FOF_NEXTOFF);
|
2011-07-15 16:20:33 +00:00
|
|
|
return (error);
|
|
|
|
}
|