1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Copyright (c) 1982, 1986, 1993
|
|
|
|
* The Regents of the University of California. All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 3. All advertising materials mentioning features or use of this software
|
|
|
|
* must display the following acknowledgement:
|
|
|
|
* This product includes software developed by the University of
|
|
|
|
* California, Berkeley and its contributors.
|
|
|
|
* 4. Neither the name of the University nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
1997-02-10 02:22:35 +00:00
|
|
|
* @(#)fs.h 8.13 (Berkeley) 3/21/95
|
1999-08-28 01:08:13 +00:00
|
|
|
* $FreeBSD$
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
|
1994-08-21 07:03:56 +00:00
|
|
|
#ifndef _UFS_FFS_FS_H_
|
|
|
|
#define _UFS_FFS_FS_H_
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
2002-05-16 21:28:32 +00:00
|
|
|
* Each disk drive contains some number of filesystems.
|
|
|
|
* A filesystem consists of a number of cylinder groups.
|
1994-05-24 10:09:53 +00:00
|
|
|
* Each cylinder group has inodes and data.
|
|
|
|
*
|
2002-05-16 21:28:32 +00:00
|
|
|
* A filesystem is described by its super-block, which in turn
|
1994-05-24 10:09:53 +00:00
|
|
|
* describes the cylinder groups. The super-block is critical
|
|
|
|
* data and is replicated in each cylinder group to protect against
|
|
|
|
* catastrophic loss. This is done at `newfs' time and the critical
|
|
|
|
* super-block data does not change, so the copies need not be
|
|
|
|
* referenced further unless disaster strikes.
|
|
|
|
*
|
2002-05-16 21:28:32 +00:00
|
|
|
* For filesystem fs, the offsets of the various blocks of interest
|
1994-05-24 10:09:53 +00:00
|
|
|
* are given in the super block as:
|
|
|
|
* [fs->fs_sblkno] Super-block
|
|
|
|
* [fs->fs_cblkno] Cylinder group block
|
|
|
|
* [fs->fs_iblkno] Inode blocks
|
|
|
|
* [fs->fs_dblkno] Data blocks
|
|
|
|
* The beginning of cylinder group cg in fs, is given by
|
|
|
|
* the ``cgbase(fs, cg)'' macro.
|
|
|
|
*
|
2002-06-21 06:18:05 +00:00
|
|
|
* Depending on the architecture and the media, the superblock may
|
|
|
|
* reside in any one of four places. For tiny media where every block
|
|
|
|
* counts, it is placed at the very front of the partition. Historically,
|
|
|
|
* UFS1 placed it 8K from the front to leave room for the disk label and
|
|
|
|
* a small bootstrap. For UFS2 it got moved to 64K from the front to leave
|
|
|
|
* room for the disk label and a bigger bootstrap, and for really piggy
|
|
|
|
* systems we check at 256K from the front if the first three fail. In
|
|
|
|
* all cases the size of the superblock will be SBLOCKSIZE. All values are
|
|
|
|
* given in byte-offset form, so they do not imply a sector size. The
|
|
|
|
* SBLOCKSEARCH specifies the order in which the locations should be searched.
|
|
|
|
*/
|
|
|
|
#define SBLOCK_FLOPPY 0 /* byte offset: very front of the partition (tiny media) */
|
|
|
|
#define SBLOCK_UFS1 8192 /* byte offset: 8K from the front (historic UFS1 location) */
|
|
|
|
#define SBLOCK_UFS2 65536 /* byte offset: 64K from the front (UFS2 location) */
|
|
|
|
#define SBLOCK_PIGGY 262144 /* byte offset: 256K from the front, tried when the others fail */
|
|
|
|
#define SBLOCKSIZE 8192 /* size of the superblock, the same at every location */
|
|
|
|
#define SBLOCKSEARCH \
|
|
|
|
{ SBLOCK_UFS2, SBLOCK_UFS1, SBLOCK_FLOPPY, SBLOCK_PIGGY, -1 } /* search order; trailing -1 presumably marks the end of the list -- confirm against users */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Max number of fragments per block. This value is NOT tweakable.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2002-04-03 20:39:27 +00:00
|
|
|
#define MAXFRAG 8
|
2002-06-21 06:18:05 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Addresses stored in inodes are capable of addressing fragments
|
1995-05-30 08:16:23 +00:00
|
|
|
* of `blocks'. File system blocks of at most size MAXBSIZE can
|
1994-05-24 10:09:53 +00:00
|
|
|
* be optionally broken into 2, 4, or 8 pieces, each of which is
|
1996-01-30 23:02:38 +00:00
|
|
|
* addressable; these pieces may be DEV_BSIZE, or some multiple of
|
1994-05-24 10:09:53 +00:00
|
|
|
* a DEV_BSIZE unit.
|
|
|
|
*
|
|
|
|
* Large files consist of exclusively large data blocks. To avoid
|
|
|
|
* undue wasted disk space, the last data block of a small file may be
|
|
|
|
* allocated as only as many fragments of a large block as are
|
2002-05-16 21:28:32 +00:00
|
|
|
* necessary. The filesystem format retains only a single pointer
|
1994-05-24 10:09:53 +00:00
|
|
|
* to such a fragment, which is a piece of a single large block that
|
|
|
|
* has been divided. The size of such a fragment is determinable from
|
|
|
|
* information in the inode, using the ``blksize(fs, ip, lbn)'' macro.
|
|
|
|
*
|
2002-05-16 21:28:32 +00:00
|
|
|
* The filesystem records space availability at the fragment level;
|
1994-05-24 10:09:53 +00:00
|
|
|
* to determine block availability, aligned fragments are examined.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* MINBSIZE is the smallest allowable block size.
|
|
|
|
* In order to ensure that it is possible to create files of size
|
|
|
|
* 2^32 with only two levels of indirection, MINBSIZE is set to 4096.
|
|
|
|
* MINBSIZE must be big enough to hold a cylinder group block,
|
|
|
|
* thus changes to (struct cg) must keep its size within MINBSIZE.
|
|
|
|
* Note that super blocks are always of size SBLOCKSIZE,
|
|
|
|
* and that both SBLOCKSIZE and MAXBSIZE must be >= MINBSIZE.
|
|
|
|
*/
|
|
|
|
#define MINBSIZE 4096
|
|
|
|
|
|
|
|
/*
|
2002-05-16 21:28:32 +00:00
|
|
|
* The path name on which the filesystem is mounted is maintained
|
1995-05-30 08:16:23 +00:00
|
|
|
* in fs_fsmnt. MAXMNTLEN defines the amount of space allocated in
|
1994-05-24 10:09:53 +00:00
|
|
|
* the super block for this name.
|
1997-02-10 02:22:35 +00:00
|
|
|
*/
|
|
|
|
#define MAXMNTLEN 512
|
|
|
|
|
|
|
|
/*
|
2001-01-15 18:30:40 +00:00
|
|
|
* There is a 128-byte region in the superblock reserved for in-core
|
|
|
|
* pointers to summary information. Originally this included an array
|
2001-12-16 18:51:11 +00:00
|
|
|
* of pointers to blocks of struct csum; now there are just a few
|
2001-01-15 18:30:40 +00:00
|
|
|
* pointers and the remaining space is padded with fs_ocsp[].
|
|
|
|
*
|
|
|
|
* NOCSPTRS determines the size of this padding. One pointer (fs_csp)
|
|
|
|
* is taken away to point to a contiguous array of struct csum for
|
|
|
|
* all cylinder groups; a second (fs_maxcluster) points to an array
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems then on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the perfomance speedup may be. I have done two file/directory
intensive tests on a two OpenBSD systems with old and new dirpref algorithm.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirprefnew dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - speed increasement in times, ie. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder groups than its
parent directory resulting in a directory tree that is spreaded across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then perfomance
degradation becomes very apparent.
What I mean by a big file system ?
1. A big filesystem is a filesystem which occupy 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref give me a good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweeked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem perfomance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/coping of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speedup a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
* of cluster sizes that is computed as cylinder groups are inspected,
|
|
|
|
* and the third points to an array that tracks the creation of new
|
2001-12-16 18:51:11 +00:00
|
|
|
* directories. A fourth pointer, fs_active, is used when creating
|
|
|
|
* snapshots; it points to a bitmap of cylinder groups for which the
|
|
|
|
* free-block bitmap has changed since the snapshot operation began.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2001-12-16 18:51:11 +00:00
|
|
|
#define NOCSPTRS ((128 / sizeof(void *)) - 4) /* pointer slots in the 128-byte region, less the 4 pointers in use */
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* A summary of contiguous blocks of various sizes is maintained
|
|
|
|
* in each cylinder group. Normally this is set by the initial
|
|
|
|
* value of fs_maxcontig. To conserve space, a maximum summary size
|
|
|
|
* is set by FS_MAXCONTIG.
|
|
|
|
*/
|
|
|
|
#define FS_MAXCONTIG 16
|
|
|
|
|
|
|
|
/*
|
2002-05-16 21:28:32 +00:00
|
|
|
* MINFREE gives the minimum acceptable percentage of filesystem
|
1994-05-24 10:09:53 +00:00
|
|
|
* blocks which may be free. If the freelist drops below this level
|
|
|
|
* only the superuser may continue to allocate blocks. This may
|
|
|
|
* be set to 0 if no reserve of free blocks is deemed necessary,
|
2002-05-16 21:28:32 +00:00
|
|
|
* however throughput drops by fifty percent if the filesystem
|
1994-05-24 10:09:53 +00:00
|
|
|
* is run at between 95% and 100% full; thus the minimum default
|
|
|
|
* value of fs_minfree is 5%. However, to get good clustering
|
|
|
|
* performance, 10% is a better choice. Hence we use 10% as our
|
|
|
|
* default value. With 10% free space, fragmentation is not a
|
|
|
|
* problem, so we choose to optimize for time.
|
|
|
|
*/
|
1995-03-10 22:18:16 +00:00
|
|
|
#define MINFREE 8 /* percent; NOTE(review): the comment above cites 10% as the default -- confirm which is intended */
|
1994-05-24 10:09:53 +00:00
|
|
|
#define DEFAULTOPT FS_OPTTIME /* optimize for time, per the rationale in the comment above */
|
|
|
|
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems then on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the perfomance speedup may be. I have done two file/directory
intensive tests on a two OpenBSD systems with old and new dirpref algorithm.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirprefnew dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - speed increasement in times, ie. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder groups than its
parent directory resulting in a directory tree that is spreaded across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then perfomance
degradation becomes very apparent.
What I mean by a big file system ?
1. A big filesystem is a filesystem which occupy 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref give me a good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweeked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem perfomance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/coping of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speedup a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
/*
|
|
|
|
* Grigoriy Orlov <gluk@ptci.ru> has done some extensive work to fine
|
|
|
|
* tune the layout preferences for directories within a filesystem.
|
|
|
|
* His algorithm can be tuned by adjusting the following parameters
|
|
|
|
* which tell the system the average file size and the average number
|
|
|
|
* of files per directory. These defaults are well selected for typical
|
|
|
|
* filesystems, but may need to be tuned for odd cases like filesystems
|
|
|
|
* being used for squid caches or news spools.
|
|
|
|
*/
|
|
|
|
#define AVFILESIZ 16384 /* expected average file size */
|
|
|
|
#define AFPDIR 64 /* expected number of files per directory */
|
|
|
|
|
2000-07-04 04:58:34 +00:00
|
|
|
/*
|
|
|
|
* The maximum number of snapshot nodes that can be associated
|
|
|
|
* with each filesystem. This limit affects only the number of
|
|
|
|
* snapshot files that can be recorded within the superblock so
|
|
|
|
* that they can be found when the filesystem is mounted. However,
|
|
|
|
* maintaining too many will slow the filesystem performance, so
|
|
|
|
* having this limit is a good idea.
|
|
|
|
*/
|
|
|
|
#define FSMAXSNAP 20
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Used to identify special blocks in snapshots:
|
|
|
|
*
|
|
|
|
* BLK_NOCOPY - A block that was unallocated at the time the snapshot
|
|
|
|
* was taken, hence does not need to be copied when written.
|
|
|
|
* BLK_SNAP - A block held by another snapshot that is not needed by this
|
|
|
|
* snapshot. When the other snapshot is freed, the BLK_SNAP entries
|
|
|
|
* are converted to BLK_NOCOPY. These are needed to allow fsck to
|
|
|
|
* identify blocks that are in use by other snapshots (which are
|
|
|
|
* expunged from this snapshot).
|
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
#define BLK_NOCOPY ((ufs2_daddr_t)(1))
|
|
|
|
#define BLK_SNAP ((ufs2_daddr_t)(2))
|
2000-07-04 04:58:34 +00:00
|
|
|
|
2001-03-21 04:09:01 +00:00
|
|
|
/*
|
|
|
|
* Sysctl values for the fast filesystem.
|
|
|
|
*/
|
|
|
|
#define FFS_ADJ_REFCNT 1 /* adjust inode reference count */
|
|
|
|
#define FFS_ADJ_BLKCNT 2 /* adjust inode used block count */
|
|
|
|
#define FFS_BLK_FREE 3 /* free range of blocks in map */
|
|
|
|
#define FFS_DIR_FREE 4 /* free specified dir inodes in map */
|
|
|
|
#define FFS_FILE_FREE 5 /* free specified file inodes in map */
|
|
|
|
#define FFS_SET_FLAGS 6 /* set filesystem flags */
|
|
|
|
#define FFS_MAXID 7 /* number of valid ffs ids */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Command structure passed in to the filesystem to adjust filesystem values.
|
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
#define FFS_CMD_VERSION 0x19790518 /* version ID */
|
2001-03-21 04:09:01 +00:00
|
|
|
struct fsck_cmd { /* all members are fixed-width; how value/size are read presumably depends on the FFS_* id used -- confirm */
|
2002-06-21 06:18:05 +00:00
|
|
|
int32_t version; /* version of command structure; presumably FFS_CMD_VERSION -- confirm */
|
|
|
|
int32_t handle; /* reference to filesystem to be changed */
|
|
|
|
int64_t value; /* inode or block number to be affected */
|
|
|
|
int64_t size; /* amount or range to be adjusted */
|
|
|
|
int64_t spare; /* reserved for future use */
|
2001-03-21 04:09:01 +00:00
|
|
|
};
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Per cylinder group information; summarized in blocks allocated
|
|
|
|
* from first cylinder group data blocks. These blocks have to be
|
|
|
|
* read in from fs_csaddr (size fs_cssize) in addition to the
|
|
|
|
* super block.
|
|
|
|
*/
|
|
|
|
struct csum { /* 32-bit counters; struct csum_total carries the same four counters as int64_t */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t cs_ndir; /* number of directories */
|
|
|
|
int32_t cs_nbfree; /* number of free blocks */
|
|
|
|
int32_t cs_nifree; /* number of free inodes */
|
|
|
|
int32_t cs_nffree; /* number of free frags */
|
1994-05-24 10:09:53 +00:00
|
|
|
};
|
2002-06-21 06:18:05 +00:00
|
|
|
/*
 * Summary totals kept in 64-bit counters: the four counters of
 * struct csum widened to int64_t, plus a free-cluster count and
 * spare slots for future expansion.
 */
struct csum_total {
|
|
|
|
int64_t cs_ndir; /* number of directories */
|
|
|
|
int64_t cs_nbfree; /* number of free blocks */
|
|
|
|
int64_t cs_nifree; /* number of free inodes */
|
|
|
|
int64_t cs_nffree; /* number of free frags */
|
|
|
|
int64_t cs_numclusters; /* number of free clusters */
|
|
|
|
int64_t cs_spare[3]; /* future expansion */
|
|
|
|
};
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
/*
|
2002-05-16 21:28:32 +00:00
|
|
|
* Super block for an FFS filesystem.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
struct fs {
|
2002-05-16 21:28:32 +00:00
|
|
|
int32_t fs_firstfield; /* historic filesystem linked list, */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t fs_unused_1; /* used for incore super blocks */
|
2002-06-21 06:18:05 +00:00
|
|
|
int32_t fs_sblkno; /* offset of super-block in filesys */
|
|
|
|
int32_t fs_cblkno; /* offset of cyl-block in filesys */
|
|
|
|
int32_t fs_iblkno; /* offset of inode-blocks in filesys */
|
|
|
|
int32_t fs_dblkno; /* offset of first data after cg */
|
|
|
|
int32_t fs_old_cgoffset; /* cylinder group offset in cylinder */
|
|
|
|
int32_t fs_old_cgmask; /* used to calc mod fs_ntrak */
|
|
|
|
int32_t fs_old_time; /* last time written */
|
|
|
|
int32_t fs_old_size; /* number of blocks in fs */
|
|
|
|
int32_t fs_old_dsize; /* number of data blocks in fs */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t fs_ncg; /* number of cylinder groups */
|
|
|
|
int32_t fs_bsize; /* size of basic blocks in fs */
|
|
|
|
int32_t fs_fsize; /* size of frag blocks in fs */
|
|
|
|
int32_t fs_frag; /* number of frags in a block in fs */
|
1994-05-24 10:09:53 +00:00
|
|
|
/* these are configuration parameters */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t fs_minfree; /* minimum percentage of free blocks */
|
2002-06-21 06:18:05 +00:00
|
|
|
int32_t fs_old_rotdelay; /* num of ms for optimal next block */
|
|
|
|
int32_t fs_old_rps; /* disk revolutions per second */
|
1994-05-24 10:09:53 +00:00
|
|
|
/* these fields can be computed from the others */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t fs_bmask; /* ``blkoff'' calc of blk offsets */
|
|
|
|
int32_t fs_fmask; /* ``fragoff'' calc of frag offsets */
|
|
|
|
int32_t fs_bshift; /* ``lblkno'' calc of logical blkno */
|
|
|
|
int32_t fs_fshift; /* ``numfrags'' calc number of frags */
|
1994-05-24 10:09:53 +00:00
|
|
|
/* these are configuration parameters */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t fs_maxcontig; /* max number of contiguous blks */
|
|
|
|
int32_t fs_maxbpg; /* max number of blks per cyl group */
|
1994-05-24 10:09:53 +00:00
|
|
|
/* these fields can be computed from the others */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t fs_fragshift; /* block to frag shift */
|
|
|
|
int32_t fs_fsbtodb; /* fsbtodb and dbtofsb shift constant */
|
|
|
|
int32_t fs_sbsize; /* actual size of super block */
|
2002-06-21 06:18:05 +00:00
|
|
|
int32_t fs_spare1[2]; /* old fs_csmask */
|
|
|
|
/* old fs_csshift */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t fs_nindir; /* value of NINDIR */
|
|
|
|
int32_t fs_inopb; /* value of INOPB */
|
2002-06-21 06:18:05 +00:00
|
|
|
int32_t fs_old_nspf; /* value of NSPF */
|
1994-05-24 10:09:53 +00:00
|
|
|
/* yet another configuration parameter */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t fs_optim; /* optimization preference, see below */
|
2002-06-21 06:18:05 +00:00
|
|
|
int32_t fs_old_npsect; /* # sectors/track including spares */
|
|
|
|
int32_t fs_old_interleave; /* hardware sector interleave */
|
|
|
|
int32_t fs_old_trackskew; /* sector 0 skew, per track */
|
1997-03-24 03:19:37 +00:00
|
|
|
int32_t fs_id[2]; /* unique filesystem id */
|
1994-05-24 10:09:53 +00:00
|
|
|
/* sizes determined by number of cylinder groups and their sizes */
|
2002-06-21 06:18:05 +00:00
|
|
|
int32_t fs_old_csaddr; /* blk addr of cyl grp summary area */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t fs_cssize; /* size of cyl grp summary area */
|
|
|
|
int32_t fs_cgsize; /* cylinder group size */
|
2002-06-21 06:18:05 +00:00
|
|
|
int32_t fs_spare2; /* old fs_ntrak */
|
|
|
|
int32_t fs_old_nsect; /* sectors per track */
|
|
|
|
int32_t fs_old_spc; /* sectors per cylinder */
|
|
|
|
int32_t fs_old_ncyl; /* cylinders in filesystem */
|
|
|
|
int32_t fs_old_cpg; /* cylinders per group */
|
|
|
|
int32_t fs_ipg; /* inodes per group */
|
|
|
|
int32_t fs_fpg; /* blocks per group * fs_frag */
|
1994-05-24 10:09:53 +00:00
|
|
|
/* this data must be re-computed after crashes */
|
2002-06-21 06:18:05 +00:00
|
|
|
struct csum fs_old_cstotal; /* cylinder summary information */
|
1994-05-24 10:09:53 +00:00
|
|
|
/* these fields are cleared at mount time */
|
1997-02-10 02:22:35 +00:00
|
|
|
int8_t fs_fmod; /* super block modified flag */
|
2002-05-16 21:28:32 +00:00
|
|
|
int8_t fs_clean; /* filesystem is clean flag */
|
1997-02-10 02:22:35 +00:00
|
|
|
int8_t fs_ronly; /* mounted read-only flag */
|
1998-03-08 09:59:44 +00:00
|
|
|
int8_t fs_flags; /* see FS_ flags below */
|
1997-02-10 02:22:35 +00:00
|
|
|
u_char fs_fsmnt[MAXMNTLEN]; /* name mounted on */
|
1994-05-24 10:09:53 +00:00
|
|
|
/* these fields retain the current block allocation info */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t fs_cgrotor; /* last cg searched */
|
2001-01-15 18:30:40 +00:00
|
|
|
void *fs_ocsp[NOCSPTRS]; /* padding; was list of fs_cs buffers */
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems then on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the perfomance speedup may be. I have done two file/directory
intensive tests on a two OpenBSD systems with old and new dirpref algorithm.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirprefnew dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - speed increasement in times, ie. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder groups than its
parent directory resulting in a directory tree that is spreaded across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then perfomance
degradation becomes very apparent.
What I mean by a big file system ?
1. A big filesystem is a filesystem which occupy 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref give me a good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweeked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem perfomance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/coping of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speedup a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
u_int8_t *fs_contigdirs; /* # of contiguously allocated dirs */
|
2002-06-21 06:18:05 +00:00
|
|
|
struct csum *fs_csp; /* cg summary info buffer for fs_cs */
|
2001-01-15 18:30:40 +00:00
|
|
|
int32_t *fs_maxcluster; /* max cluster in each cyl group */
|
2002-01-17 08:33:32 +00:00
|
|
|
u_int *fs_active; /* used by snapshots to track fs */
|
2002-06-21 06:18:05 +00:00
|
|
|
int32_t fs_old_cpc; /* cyl per cycle in postbl */
|
|
|
|
int32_t fs_maxbsize; /* maximum blocking factor permitted */
|
|
|
|
int64_t fs_sparecon64[17]; /* old rotation block list head */
|
|
|
|
int64_t fs_sblockloc; /* location of standard superblock */
|
|
|
|
struct csum_total fs_cstotal; /* cylinder summary information */
|
|
|
|
ufs_time_t fs_time; /* last time written */
|
|
|
|
int64_t fs_size; /* number of blocks in fs */
|
|
|
|
int64_t fs_dsize; /* number of data blocks in fs */
|
|
|
|
ufs2_daddr_t fs_csaddr; /* blk addr of cyl grp summary area */
|
|
|
|
int64_t fs_pendingblocks; /* blocks in process of being freed */
|
|
|
|
int32_t fs_pendinginodes; /* inodes in process of being freed */
|
2000-07-04 04:58:34 +00:00
|
|
|
int32_t fs_snapinum[FSMAXSNAP];/* list of snapshot inode numbers */
|
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>.
His description of the problem and solution follow. My own tests show
speedups on typical filesystem intensive workloads of 5% to 12% which
is very impressive considering the small amount of code change involved.
------
One day I noticed that some file operations run much faster on
small file systems then on big ones. I've looked at the ffs
algorithms, thought about them, and redesigned the dirpref algorithm.
First I want to describe the results of my tests. These results are old
and I have improved the algorithm after these tests were done. Nevertheless
they show how big the perfomance speedup may be. I have done two file/directory
intensive tests on a two OpenBSD systems with old and new dirpref algorithm.
The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports".
The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release.
It contains 6596 directories and 13868 files. The test systems are:
1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for
test is at wd1. Size of test file system is 8 Gb, number of cg=991,
size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current
from Dec 2000 with BUFCACHEPERCENT=35
2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system
at wd0, file system for test is at wd1. Size of test file system is 40 Gb,
number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k
OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50
You can get more info about the test systems and methods at:
http://www.ptci.ru/gluk/dirpref/old/dirpref.html
Test Results
tar -xzf ports.tar.gz rm -rf ports
mode old dirpref new dirpref speedup old dirprefnew dirpref speedup
First system
normal 667 472 1.41 477 331 1.44
async 285 144 1.98 130 14 9.29
sync 768 616 1.25 477 334 1.43
softdep 413 252 1.64 241 38 6.34
Second system
normal 329 81 4.06 263.5 93.5 2.81
async 302 25.7 11.75 112 2.26 49.56
sync 281 57.0 4.93 263 90.5 2.9
softdep 341 40.6 8.4 284 4.76 59.66
"old dirpref" and "new dirpref" columns give a test time in seconds.
speedup - speed increasement in times, ie. old dirpref / new dirpref.
------
Algorithm description
The old dirpref algorithm is described in comments:
/*
* Find a cylinder to place a directory.
*
* The policy implemented by this algorithm is to select from
* among those cylinder groups with above the average number of
* free inodes, the one with the smallest number of directories.
*/
A new directory is allocated in a different cylinder groups than its
parent directory resulting in a directory tree that is spreaded across
all the cylinder groups. This spreading out results in a non-optimal
access to the directories and files. When we have a small filesystem
it is not a problem but when the filesystem is big then perfomance
degradation becomes very apparent.
What I mean by a big file system ?
1. A big filesystem is a filesystem which occupy 20-30 or more percent
of total drive space, i.e. first and last cylinder are physically
located relatively far from each other.
2. It has a relatively large number of cylinder groups, for example
more cylinder groups than 50% of the buffers in the buffer cache.
The first results in long access times, while the second results in
many buffers being used by metadata operations. Such operations use
cylinder group blocks and on-disk inode blocks. The cylinder group
block (fs->fs_cblkno) contains struct cg, inode and block bit maps.
It is 2k in size for the default filesystem parameters. If new and
parent directories are located in different cylinder groups then the
system performs more input/output operations and uses more buffers.
On filesystems with many cylinder groups, lots of cache buffers are
used for metadata operations.
My solution for this problem is very simple. I allocate many directories
in one cylinder group. I also do some things, so that the new allocation
method does not cause excessive fragmentation and all directory inodes
will not be located at a location far from its file's inodes and data.
The algorithm is:
/*
* Find a cylinder group to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
My early versions of dirpref give me a good results for a wide range of
file operations and different filesystem capacities except one case:
those applications that create their entire directory structure first
and only later fill this structure with files.
My solution for such and similar cases is to limit a number of
directories which may be created one after another in the same cylinder
group without intervening file creations. For this purpose, I allocate
an array of counters at mount time. This array is linked to the superblock
fs->fs_contigdirs[cg]. Each time a directory is created the counter
increases and each time a file is created the counter decreases. A 60Gb
filesystem with 8mb/cg requires 10kb of memory for the counters array.
The maxcontigdirs is a maximum number of directories which may be created
without an intervening file creation. I found in my tests that the best
performance occurs when I restrict the number of directories in one cylinder
group such that all its files may be located in the same cylinder group.
There may be some deterioration in performance if all the file inodes
are in the same cylinder group as its containing directory, but their
data partially resides in a different cylinder group. The maxcontigdirs
value is calculated to try to prevent this condition. Since there is
no way to know how many files and directories will be allocated later
I added two optimization parameters in superblock/tunefs. They are:
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
These parameters have reasonable defaults but may be tweeked for special
uses of a filesystem. They are only necessary in rare cases like better
tuning a filesystem being used to store a squid cache.
I have been using this algorithm for about 3 months. I have done
a lot of testing on filesystems with different capacities, average
filesize, average number of files per directory, and so on. I think
this algorithm has no negative impact on filesystem perfomance. It
works better than the default one in all cases. The new dirpref
will greatly improve untarring/removing/coping of big directories,
decrease load on cvs servers and much more. The new dirpref doesn't
speedup a compilation process, but also doesn't slow it down.
Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
|
|
|
int32_t fs_avgfilesize; /* expected average file size */
|
|
|
|
int32_t fs_avgfpdir; /* expected # of files per directory */
|
2002-06-21 06:18:05 +00:00
|
|
|
int32_t fs_save_cgsize; /* save real cg size to use fs_bsize */
|
|
|
|
int32_t fs_sparecon32[27]; /* reserved for future constants */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t fs_contigsumsize; /* size of cluster summary array */
|
|
|
|
int32_t fs_maxsymlinklen; /* max length of an internal symlink */
|
2002-06-21 06:18:05 +00:00
|
|
|
int32_t fs_old_inodefmt; /* format of on-disk inodes */
|
1997-02-10 02:22:35 +00:00
|
|
|
u_int64_t fs_maxfilesize; /* maximum representable file size */
|
|
|
|
int64_t fs_qbmask; /* ~fs_bmask for use with 64-bit size */
|
|
|
|
int64_t fs_qfmask; /* ~fs_fmask for use with 64-bit size */
|
|
|
|
int32_t fs_state; /* validate fs_clean field */
|
2002-06-21 06:18:05 +00:00
|
|
|
int32_t fs_old_postblformat; /* format of positional layout tables */
|
|
|
|
int32_t fs_old_nrpos; /* number of rotational positions */
|
|
|
|
int32_t fs_spare5[2]; /* old fs_postbloff */
|
|
|
|
/* old fs_rotbloff */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t fs_magic; /* magic number */
|
1994-05-24 10:09:53 +00:00
|
|
|
};
|
1997-02-10 02:22:35 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
1996-01-30 23:02:38 +00:00
|
|
|
* Filesystem identification
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
#define FS_UFS1_MAGIC 0x011954 /* UFS1 fast filesystem magic number */
|
|
|
|
#define FS_UFS2_MAGIC 0x19540119 /* UFS2 fast filesystem magic number */
|
1994-05-24 10:09:53 +00:00
|
|
|
#define FS_OKAY 0x7c269d38 /* superblock checksum */
|
|
|
|
#define FS_42INODEFMT (-1) /* 4.2BSD inode format; parenthesized so the sign cannot bind to a neighbouring operand */
|
|
|
|
#define FS_44INODEFMT 2 /* 4.4BSD inode format */
|
1998-03-08 09:59:44 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Preference for optimization.
|
|
|
|
*/
|
|
|
|
#define FS_OPTTIME 0 /* minimize allocation time */
|
|
|
|
#define FS_OPTSPACE 1 /* minimize disk fragmentation */
|
|
|
|
|
1998-03-08 09:59:44 +00:00
|
|
|
/*
|
|
|
|
* Filesystem flags.
|
2001-04-14 05:26:28 +00:00
|
|
|
*
|
2002-06-21 06:18:05 +00:00
|
|
|
* The FS_UNCLEAN flag is set by the kernel when the filesystem was
|
|
|
|
* mounted with fs_clean set to zero. The FS_DOSOFTDEP flag indicates
|
|
|
|
* that the filesystem should be managed by the soft updates code.
|
2001-04-14 05:26:28 +00:00
|
|
|
* Note that the FS_NEEDSFSCK flag is set and cleared only by the
|
|
|
|
* fsck utility. It is set when background fsck finds an unexpected
|
|
|
|
* inconsistency which requires a traditional foreground fsck to be
|
|
|
|
* run. Such inconsistencies should only be found after an uncorrectable
|
|
|
|
* disk error. A foreground fsck will clear the FS_NEEDSFSCK flag when
|
|
|
|
* it has successfully cleaned up the filesystem. The kernel uses this
|
|
|
|
* flag to enforce that inconsistent filesystems be mounted read-only.
|
2002-06-21 06:18:05 +00:00
|
|
|
* The FS_INDEXDIRS flag when set indicates that the kernel maintains
|
|
|
|
* on-disk auxiliary indexes (such as B-trees) for speeding directory
|
|
|
|
* accesses. Kernels that do not support auxiliary indices clear the
|
|
|
|
* flag to indicate that the indices need to be rebuilt (by fsck) before
|
|
|
|
* they can be used.
|
2002-10-14 17:07:11 +00:00
|
|
|
*
|
|
|
|
* FS_ACLS indicates that ACLs are administratively enabled for the
|
|
|
|
* file system, so they should be loaded from extended attributes,
|
|
|
|
* observed for access control purposes, and be administered by object
|
|
|
|
* owners. FS_MULTILABEL indicates that the TrustedBSD MAC Framework
|
|
|
|
* should attempt to back MAC labels into extended attributes on the
|
|
|
|
* file system rather than maintain a single mount label for all
|
|
|
|
* objects.
|
1998-03-08 09:59:44 +00:00
|
|
|
*/
|
2001-04-14 05:26:28 +00:00
|
|
|
#define FS_UNCLEAN 0x01 /* filesystem not clean at mount */
|
|
|
|
#define FS_DOSOFTDEP 0x02 /* filesystem using soft dependencies */
|
|
|
|
#define FS_NEEDSFSCK 0x04 /* filesystem needs sync fsck before mount */
|
2002-06-21 06:18:05 +00:00
|
|
|
#define FS_INDEXDIRS 0x08 /* kernel supports indexed directories */
|
2002-10-14 17:07:11 +00:00
|
|
|
#define FS_ACLS 0x10 /* file system has ACLs enabled */
|
|
|
|
#define FS_MULTILABEL 0x20 /* file system is MAC multi-label */
|
1994-05-24 10:09:53 +00:00
|
|
|
|
2001-12-18 18:05:17 +00:00
|
|
|
/*
|
|
|
|
* Macros to access bits in the fs_active array.
|
|
|
|
*/
|
|
|
|
/*
 * Word of (fs)->fs_active that holds the bit for cylinder group `cg';
 * each word covers NBBY * sizeof(int) cylinder groups.
 */
#define ACTIVECGNUM(fs, cg) ((fs)->fs_active[(cg) / (NBBY * sizeof(int))])
|
2002-01-17 08:33:32 +00:00
|
|
|
/*
 * Bit mask selecting cylinder group `cg' within its fs_active word.
 * The constant is unsigned because fs_active is an array of u_int and
 * because selecting the topmost bit of the word with a signed 1 would
 * shift into the sign bit, which is undefined behavior.
 */
#define ACTIVECGOFF(cg) (1U << ((cg) % (NBBY * sizeof(int))))
|
2001-12-18 18:05:17 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* The size of a cylinder group is calculated by CGSIZE. The maximum size
|
|
|
|
* is limited by the fact that cylinder groups are at most one block.
|
1995-05-30 08:16:23 +00:00
|
|
|
* Its size is derived from the size of the maps maintained in the
|
1994-05-24 10:09:53 +00:00
|
|
|
* cylinder group and the (struct cg) size.
|
|
|
|
*/
|
|
|
|
#define CGSIZE(fs) \
|
1997-02-10 02:22:35 +00:00
|
|
|
/* base cg */ (sizeof(struct cg) + sizeof(int32_t) + \
|
2002-06-21 06:18:05 +00:00
|
|
|
/* old btotoff */ (fs)->fs_old_cpg * sizeof(int32_t) + \
|
|
|
|
/* old boff */ (fs)->fs_old_cpg * sizeof(u_int16_t) + \
|
1994-05-24 10:09:53 +00:00
|
|
|
/* inode map */ howmany((fs)->fs_ipg, NBBY) + \
|
2002-06-21 06:18:05 +00:00
|
|
|
/* block map */ howmany((fs)->fs_fpg, NBBY) +\
|
1994-05-24 10:09:53 +00:00
|
|
|
/* if present */ ((fs)->fs_contigsumsize <= 0 ? 0 : \
|
1997-02-10 02:22:35 +00:00
|
|
|
/* cluster sum */ (fs)->fs_contigsumsize * sizeof(int32_t) + \
|
2002-06-21 06:18:05 +00:00
|
|
|
/* cluster map */ howmany(fragstoblks(fs, (fs)->fs_fpg), NBBY)))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The minimal number of cylinder groups that should be created.
|
|
|
|
*/
|
|
|
|
#define MINCYLGRPS 4
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Convert cylinder group to base address of its global summary info.
|
|
|
|
*/
|
2001-01-15 18:30:40 +00:00
|
|
|
/*
 * The expansion is the bare member expression fs_csp[indx]; the `fs'
 * argument does not appear in it.  Presumably meant to be written after
 * a struct fs pointer, as in fs->fs_cs(fs, cg) -- confirm against callers.
 */
#define fs_cs(fs, indx) fs_csp[indx]
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
/*
|
2002-05-16 21:28:32 +00:00
|
|
|
* Cylinder group block for a filesystem.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
#define CG_MAGIC 0x090255
|
1997-02-10 02:22:35 +00:00
|
|
|
struct cg {
|
|
|
|
int32_t cg_firstfield; /* historic cyl groups linked list */
|
|
|
|
int32_t cg_magic; /* magic number */
|
2002-06-21 06:18:05 +00:00
|
|
|
int32_t cg_old_time; /* time last written */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t cg_cgx; /* we are the cgx'th cylinder group */
|
2002-06-21 06:18:05 +00:00
|
|
|
int16_t cg_old_ncyl; /* number of cyl's this cg */
|
|
|
|
int16_t cg_old_niblk; /* number of inode blocks this cg */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t cg_ndblk; /* number of data blocks this cg */
|
1994-05-24 10:09:53 +00:00
|
|
|
struct csum cg_cs; /* cylinder summary information */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t cg_rotor; /* position of last used block */
|
|
|
|
int32_t cg_frotor; /* position of last used frag */
|
|
|
|
int32_t cg_irotor; /* position of last used inode */
|
|
|
|
int32_t cg_frsum[MAXFRAG]; /* counts of available frags */
|
2002-06-21 06:18:05 +00:00
|
|
|
int32_t cg_old_btotoff; /* (int32) block totals per cylinder */
|
|
|
|
int32_t cg_old_boff; /* (u_int16) free block positions */
|
1997-02-10 02:22:35 +00:00
|
|
|
int32_t cg_iusedoff; /* (u_int8) used inode map */
|
|
|
|
int32_t cg_freeoff; /* (u_int8) free block map */
|
|
|
|
int32_t cg_nextfreeoff; /* (u_int8) next available space */
|
|
|
|
int32_t cg_clustersumoff; /* (u_int32) counts of avail clusters */
|
|
|
|
int32_t cg_clusteroff; /* (u_int8) free cluster map */
|
|
|
|
int32_t cg_nclusterblks; /* number of clusters this cg */
|
2002-06-21 06:18:05 +00:00
|
|
|
int32_t cg_niblk; /* number of inode blocks this cg */
|
|
|
|
int32_t cg_initediblk; /* last initialized inode */
|
|
|
|
int32_t cg_sparecon32[3]; /* reserved for future use */
|
|
|
|
ufs_time_t cg_time; /* time last written */
|
|
|
|
int64_t cg_sparecon64[3]; /* reserved for future use */
|
1997-02-10 02:22:35 +00:00
|
|
|
u_int8_t cg_space[1]; /* space for cylinder group maps */
|
1994-05-24 10:09:53 +00:00
|
|
|
/* actually longer */
|
|
|
|
};
|
1997-02-10 02:22:35 +00:00
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
/*
|
|
|
|
* Macros for access to cylinder group array structures
|
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
#define cg_chkmagic(cgp) ((cgp)->cg_magic == CG_MAGIC)
|
1994-05-24 10:09:53 +00:00
|
|
|
#define cg_inosused(cgp) \
|
2002-06-21 06:18:05 +00:00
|
|
|
((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_iusedoff))
|
1994-05-24 10:09:53 +00:00
|
|
|
#define cg_blksfree(cgp) \
|
2002-06-21 06:18:05 +00:00
|
|
|
((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_freeoff))
|
1994-05-24 10:09:53 +00:00
|
|
|
#define cg_clustersfree(cgp) \
|
1997-02-10 02:22:35 +00:00
|
|
|
((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_clusteroff))
|
1994-05-24 10:09:53 +00:00
|
|
|
#define cg_clustersum(cgp) \
|
1997-02-10 02:22:35 +00:00
|
|
|
((int32_t *)((u_int8_t *)(cgp) + (cgp)->cg_clustersumoff))
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
/*
|
2002-05-16 21:28:32 +00:00
|
|
|
* Turn filesystem block numbers into disk block addresses.
|
|
|
|
* This maps filesystem blocks to device size blocks.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
/*
 * Widen the filesystem block number to 64 bits before shifting so that
 * a 32-bit block number cannot overflow while being scaled up to device
 * blocks.
 */
#define fsbtodb(fs, b) ((ufs2_daddr_t)(b) << (fs)->fs_fsbtodb)
|
|
|
|
#define dbtofsb(fs, b) ((b) >> (fs)->fs_fsbtodb)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Cylinder group macros to locate things in cylinder groups.
|
2002-05-16 21:28:32 +00:00
|
|
|
* They calc filesystem addresses of cylinder group data structures.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
2002-06-21 06:18:05 +00:00
|
|
|
/*
 * First fragment address of cylinder group `c'.  fs_fpg is widened to
 * ufs2_daddr_t before the multiplication; casting only the finished
 * product would leave the multiply itself in 32-bit arithmetic (when
 * `c' is an int), where it overflows on large filesystems.
 */
#define cgbase(fs, c) (((ufs2_daddr_t)(fs)->fs_fpg) * (c))
|
1994-05-24 10:09:53 +00:00
|
|
|
#define cgdmin(fs, c) (cgstart(fs, c) + (fs)->fs_dblkno) /* 1st data */
|
|
|
|
#define cgimin(fs, c) (cgstart(fs, c) + (fs)->fs_iblkno) /* inode blk */
|
|
|
|
#define cgsblock(fs, c) (cgstart(fs, c) + (fs)->fs_sblkno) /* super blk */
|
|
|
|
#define cgtod(fs, c) (cgstart(fs, c) + (fs)->fs_cblkno) /* cg block */
|
|
|
|
#define cgstart(fs, c) \
|
2002-06-21 06:18:05 +00:00
|
|
|
((fs)->fs_magic == FS_UFS2_MAGIC ? cgbase(fs, c) : \
|
|
|
|
(cgbase(fs, c) + (fs)->fs_old_cgoffset * ((c) & ~((fs)->fs_old_cgmask))))
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Macros for handling inode numbers:
|
2002-05-16 21:28:32 +00:00
|
|
|
* inode number to filesystem block offset.
|
1994-05-24 10:09:53 +00:00
|
|
|
* inode number to cylinder group number.
|
2002-05-16 21:28:32 +00:00
|
|
|
* inode number to filesystem block address.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
#define ino_to_cg(fs, x) ((x) / (fs)->fs_ipg)
|
|
|
|
#define ino_to_fsba(fs, x) \
|
2002-06-21 06:18:05 +00:00
|
|
|
((ufs2_daddr_t)(cgimin(fs, ino_to_cg(fs, x)) + \
|
1994-05-24 10:09:53 +00:00
|
|
|
(blkstofrags((fs), (((x) % (fs)->fs_ipg) / INOPB(fs))))))
|
|
|
|
#define ino_to_fsbo(fs, x) ((x) % INOPB(fs))
|
|
|
|
|
|
|
|
/*
|
2002-05-16 21:28:32 +00:00
|
|
|
* Give cylinder group number for a filesystem block.
|
|
|
|
* Give cylinder group block number for a filesystem block.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
#define dtog(fs, d) ((d) / (fs)->fs_fpg)
|
|
|
|
#define dtogd(fs, d) ((d) % (fs)->fs_fpg)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Extract the bits for a block from a map.
|
|
|
|
* Compute the cylinder and rotational position of a cyl block addr.
|
|
|
|
*/
|
|
|
|
#define blkmap(fs, map, loc) \
|
|
|
|
(((map)[(loc) / NBBY] >> ((loc) % NBBY)) & (0xff >> (NBBY - (fs)->fs_frag)))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The following macros optimize certain frequently calculated
|
|
|
|
* quantities by using shifts and masks in place of divisions
|
|
|
|
* modulos and multiplications.
|
|
|
|
*/
|
|
|
|
#define blkoff(fs, loc) /* calculates (loc % fs->fs_bsize) */ \
|
|
|
|
((loc) & (fs)->fs_qbmask)
|
|
|
|
#define fragoff(fs, loc) /* calculates (loc % fs->fs_fsize) */ \
|
|
|
|
((loc) & (fs)->fs_qfmask)
|
2002-06-21 06:18:05 +00:00
|
|
|
#define lfragtosize(fs, frag) /* calculates ((off_t)frag * fs->fs_fsize) */ \
|
|
|
|
((off_t)(frag) << (fs)->fs_fshift)
|
1996-10-12 22:12:51 +00:00
|
|
|
#define lblktosize(fs, blk) /* calculates ((off_t)blk * fs->fs_bsize) */ \
|
|
|
|
((off_t)(blk) << (fs)->fs_bshift)
|
|
|
|
/* Use this only when `blk' is known to be small, e.g., < NDADDR. */
|
|
|
|
#define smalllblktosize(fs, blk) /* calculates (blk * fs->fs_bsize) */ \
|
1994-05-24 10:09:53 +00:00
|
|
|
((blk) << (fs)->fs_bshift)
|
|
|
|
#define lblkno(fs, loc) /* calculates (loc / fs->fs_bsize) */ \
|
|
|
|
((loc) >> (fs)->fs_bshift)
|
|
|
|
#define numfrags(fs, loc) /* calculates (loc / fs->fs_fsize) */ \
|
|
|
|
((loc) >> (fs)->fs_fshift)
|
|
|
|
#define blkroundup(fs, size) /* calculates roundup(size, fs->fs_bsize) */ \
|
|
|
|
(((size) + (fs)->fs_qbmask) & (fs)->fs_bmask)
|
|
|
|
#define fragroundup(fs, size) /* calculates roundup(size, fs->fs_fsize) */ \
|
|
|
|
(((size) + (fs)->fs_qfmask) & (fs)->fs_fmask)
|
|
|
|
#define fragstoblks(fs, frags) /* calculates (frags / fs->fs_frag) */ \
|
|
|
|
((frags) >> (fs)->fs_fragshift)
|
|
|
|
#define blkstofrags(fs, blks) /* calculates (blks * fs->fs_frag) */ \
|
|
|
|
((blks) << (fs)->fs_fragshift)
|
|
|
|
#define fragnum(fs, fsb) /* calculates (fsb % fs->fs_frag) */ \
|
|
|
|
((fsb) & ((fs)->fs_frag - 1))
|
|
|
|
#define blknum(fs, fsb) /* calculates rounddown(fsb, fs->fs_frag) */ \
|
|
|
|
((fsb) &~ ((fs)->fs_frag - 1))
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Determine the number of available frags given a
|
1997-02-10 02:22:35 +00:00
|
|
|
* percentage to hold in reserve.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
#define freespace(fs, percentreserved) \
|
|
|
|
(blkstofrags((fs), (fs)->fs_cstotal.cs_nbfree) + \
|
2000-03-17 03:44:47 +00:00
|
|
|
(fs)->fs_cstotal.cs_nffree - \
|
|
|
|
((off_t)((fs)->fs_dsize) * (percentreserved) / 100))
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
/*
|
2002-05-16 21:28:32 +00:00
|
|
|
* Determining the size of a file block in the filesystem.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
#define blksize(fs, ip, lbn) \
|
1996-10-12 22:12:51 +00:00
|
|
|
(((lbn) >= NDADDR || (ip)->i_size >= smalllblktosize(fs, (lbn) + 1)) \
|
1994-05-24 10:09:53 +00:00
|
|
|
? (fs)->fs_bsize \
|
|
|
|
: (fragroundup(fs, blkoff(fs, (ip)->i_size))))
|
1998-03-08 09:59:44 +00:00
|
|
|
#define sblksize(fs, size, lbn) \
|
|
|
|
(((lbn) >= NDADDR || (size) >= ((lbn) + 1) << (fs)->fs_bshift) \
|
|
|
|
? (fs)->fs_bsize \
|
|
|
|
: (fragroundup(fs, blkoff(fs, (size)))))
|
|
|
|
|
1994-05-24 10:09:53 +00:00
|
|
|
|
|
|
|
/*
|
1997-02-10 02:22:35 +00:00
|
|
|
* Number of inodes in a secondary storage block/fragment.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
#define INOPB(fs) ((fs)->fs_inopb)
|
|
|
|
#define INOPF(fs) ((fs)->fs_inopb >> (fs)->fs_fragshift)
|
|
|
|
|
|
|
|
/*
|
2002-05-16 21:28:32 +00:00
|
|
|
* Number of indirects in a filesystem block.
|
1994-05-24 10:09:53 +00:00
|
|
|
*/
|
|
|
|
#define NINDIR(fs) ((fs)->fs_nindir)
|
|
|
|
|
|
|
|
extern int inside[], around[];
|
|
|
|
extern u_char *fragtbl[];
|
1994-08-21 07:03:56 +00:00
|
|
|
|
|
|
|
#endif
|