598 lines
25 KiB
C
Raw Normal View History

1994-05-24 10:09:53 +00:00
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)fs.h 8.13 (Berkeley) 3/21/95
1999-08-28 01:08:13 +00:00
* $FreeBSD$
1994-05-24 10:09:53 +00:00
*/
#ifndef _UFS_FFS_FS_H_
#define _UFS_FFS_FS_H_
1994-05-24 10:09:53 +00:00
/*
* Each disk drive contains some number of file systems.
* A file system consists of a number of cylinder groups.
* Each cylinder group has inodes and data.
*
* A file system is described by its super-block, which in turn
* describes the cylinder groups. The super-block is critical
* data and is replicated in each cylinder group to protect against
* catastrophic loss. This is done at `newfs' time and the critical
* super-block data does not change, so the copies need not be
* referenced further unless disaster strikes.
*
* For file system fs, the offsets of the various blocks of interest
* are given in the super block as:
* [fs->fs_sblkno] Super-block
* [fs->fs_cblkno] Cylinder group block
* [fs->fs_iblkno] Inode blocks
* [fs->fs_dblkno] Data blocks
* The beginning of cylinder group cg in fs, is given by
* the ``cgbase(fs, cg)'' macro.
*
* The first boot and super blocks are given in absolute disk addresses.
* The byte-offset forms are preferred, as they don't imply a sector size.
*/
#define BBSIZE 8192
#define SBSIZE 8192
#define BBOFF ((off_t)(0))
#define SBOFF ((off_t)(BBOFF + BBSIZE))
#define BBLOCK ((ufs_daddr_t)(0))
#define SBLOCK ((ufs_daddr_t)(BBLOCK + BBSIZE / DEV_BSIZE))
1994-05-24 10:09:53 +00:00
/*
* Addresses stored in inodes are capable of addressing fragments
1995-05-30 08:16:23 +00:00
* of `blocks'. File system blocks of at most size MAXBSIZE can
1994-05-24 10:09:53 +00:00
* be optionally broken into 2, 4, or 8 pieces, each of which is
* addressable; these pieces may be DEV_BSIZE, or some multiple of
1994-05-24 10:09:53 +00:00
* a DEV_BSIZE unit.
*
* Large files consist of exclusively large data blocks. To avoid
* undue wasted disk space, the last data block of a small file may be
* allocated as only as many fragments of a large block as are
* necessary. The file system format retains only a single pointer
* to such a fragment, which is a piece of a single large block that
* has been divided. The size of such a fragment is determinable from
* information in the inode, using the ``blksize(fs, ip, lbn)'' macro.
*
* The file system records space availability at the fragment level;
* to determine block availability, aligned fragments are examined.
*/
/*
* MINBSIZE is the smallest allowable block size.
* In order to insure that it is possible to create files of size
* 2^32 with only two levels of indirection, MINBSIZE is set to 4096.
* MINBSIZE must be big enough to hold a cylinder group block,
* thus changes to (struct cg) must keep its size within MINBSIZE.
* Note that super blocks are always of size SBSIZE,
* and that both SBSIZE and MAXBSIZE must be >= MINBSIZE.
*/
#define MINBSIZE 4096
/*
* The path name on which the file system is mounted is maintained
1995-05-30 08:16:23 +00:00
* in fs_fsmnt. MAXMNTLEN defines the amount of space allocated in
1994-05-24 10:09:53 +00:00
* the super block for this name.
*/
#define MAXMNTLEN 512
/*
* There is a 128-byte region in the superblock reserved for in-core
* pointers to summary information. Originally this included an array
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>. His description of the problem and solution follow. My own tests show speedups on typical filesystem intensive workloads of 5% to 12% which is very impressive considering the small amount of code change involved. ------ One day I noticed that some file operations run much faster on small file systems then on big ones. I've looked at the ffs algorithms, thought about them, and redesigned the dirpref algorithm. First I want to describe the results of my tests. These results are old and I have improved the algorithm after these tests were done. Nevertheless they show how big the perfomance speedup may be. I have done two file/directory intensive tests on a two OpenBSD systems with old and new dirpref algorithm. The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports". The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release. It contains 6596 directories and 13868 files. The test systems are: 1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for test is at wd1. Size of test file system is 8 Gb, number of cg=991, size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=35 2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system at wd0, file system for test is at wd1. Size of test file system is 40 Gb, number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50 You can get more info about the test systems and methods at: http://www.ptci.ru/gluk/dirpref/old/dirpref.html Test Results tar -xzf ports.tar.gz rm -rf ports mode old dirpref new dirpref speedup old dirprefnew dirpref speedup First system normal 667 472 1.41 477 331 1.44 async 285 144 1.98 130 14 9.29 sync 768 616 1.25 477 334 1.43 softdep 413 252 1.64 241 38 6.34 Second system normal 329 81 4.06 263.5 93.5 2.81 async 302 25.7 11.75 112 2.26 49.56 sync 281 57.0 4.93 263 90.5 2.9 softdep 341 40.6 8.4 284 4.76 59.66 "old dirpref" and "new dirpref" columns give a test time in seconds. speedup - speed increasement in times, ie. old dirpref / new dirpref. ------ Algorithm description The old dirpref algorithm is described in comments: /* * Find a cylinder to place a directory. * * The policy implemented by this algorithm is to select from * among those cylinder groups with above the average number of * free inodes, the one with the smallest number of directories. */ A new directory is allocated in a different cylinder groups than its parent directory resulting in a directory tree that is spreaded across all the cylinder groups. This spreading out results in a non-optimal access to the directories and files. When we have a small filesystem it is not a problem but when the filesystem is big then perfomance degradation becomes very apparent. What I mean by a big file system ? 1. A big filesystem is a filesystem which occupy 20-30 or more percent of total drive space, i.e. first and last cylinder are physically located relatively far from each other. 2. It has a relatively large number of cylinder groups, for example more cylinder groups than 50% of the buffers in the buffer cache. The first results in long access times, while the second results in many buffers being used by metadata operations. Such operations use cylinder group blocks and on-disk inode blocks. The cylinder group block (fs->fs_cblkno) contains struct cg, inode and block bit maps. It is 2k in size for the default filesystem parameters. If new and parent directories are located in different cylinder groups then the system performs more input/output operations and uses more buffers. On filesystems with many cylinder groups, lots of cache buffers are used for metadata operations. My solution for this problem is very simple. I allocate many directories in one cylinder group. I also do some things, so that the new allocation method does not cause excessive fragmentation and all directory inodes will not be located at a location far from its file's inodes and data. The algorithm is: /* * Find a cylinder group to place a directory. * * The policy implemented by this algorithm is to allocate a * directory inode in the same cylinder group as its parent * directory, but also to reserve space for its files inodes * and data. Restrict the number of directories which may be * allocated one after another in the same cylinder group * without intervening allocation of files. * * If we allocate a first level directory then force allocation * in another cylinder group. */ My early versions of dirpref give me a good results for a wide range of file operations and different filesystem capacities except one case: those applications that create their entire directory structure first and only later fill this structure with files. My solution for such and similar cases is to limit a number of directories which may be created one after another in the same cylinder group without intervening file creations. For this purpose, I allocate an array of counters at mount time. This array is linked to the superblock fs->fs_contigdirs[cg]. Each time a directory is created the counter increases and each time a file is created the counter decreases. A 60Gb filesystem with 8mb/cg requires 10kb of memory for the counters array. The maxcontigdirs is a maximum number of directories which may be created without an intervening file creation. I found in my tests that the best performance occurs when I restrict the number of directories in one cylinder group such that all its files may be located in the same cylinder group. There may be some deterioration in performance if all the file inodes are in the same cylinder group as its containing directory, but their data partially resides in a different cylinder group. The maxcontigdirs value is calculated to try to prevent this condition. Since there is no way to know how many files and directories will be allocated later I added two optimization parameters in superblock/tunefs. They are: int32_t fs_avgfilesize; /* expected average file size */ int32_t fs_avgfpdir; /* expected # of files per directory */ These parameters have reasonable defaults but may be tweeked for special uses of a filesystem. They are only necessary in rare cases like better tuning a filesystem being used to store a squid cache. I have been using this algorithm for about 3 months. I have done a lot of testing on filesystems with different capacities, average filesize, average number of files per directory, and so on. I think this algorithm has no negative impact on filesystem perfomance. It works better than the default one in all cases. The new dirpref will greatly improve untarring/removing/coping of big directories, decrease load on cvs servers and much more. The new dirpref doesn't speedup a compilation process, but also doesn't slow it down. Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
* of pointers to blocks of struct csum; now there are just three
* pointers and the remaining space is padded with fs_ocsp[].
*
* NOCSPTRS determines the size of this padding. One pointer (fs_csp)
* is taken away to point to a contiguous array of struct csum for
* all cylinder groups; a second (fs_maxcluster) points to an array
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>. His description of the problem and solution follow. My own tests show speedups on typical filesystem intensive workloads of 5% to 12% which is very impressive considering the small amount of code change involved. ------ One day I noticed that some file operations run much faster on small file systems then on big ones. I've looked at the ffs algorithms, thought about them, and redesigned the dirpref algorithm. First I want to describe the results of my tests. These results are old and I have improved the algorithm after these tests were done. Nevertheless they show how big the perfomance speedup may be. I have done two file/directory intensive tests on a two OpenBSD systems with old and new dirpref algorithm. The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports". The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release. It contains 6596 directories and 13868 files. The test systems are: 1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for test is at wd1. Size of test file system is 8 Gb, number of cg=991, size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=35 2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system at wd0, file system for test is at wd1. Size of test file system is 40 Gb, number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50 You can get more info about the test systems and methods at: http://www.ptci.ru/gluk/dirpref/old/dirpref.html Test Results tar -xzf ports.tar.gz rm -rf ports mode old dirpref new dirpref speedup old dirprefnew dirpref speedup First system normal 667 472 1.41 477 331 1.44 async 285 144 1.98 130 14 9.29 sync 768 616 1.25 477 334 1.43 softdep 413 252 1.64 241 38 6.34 Second system normal 329 81 4.06 263.5 93.5 2.81 async 302 25.7 11.75 112 2.26 49.56 sync 281 57.0 4.93 263 90.5 2.9 softdep 341 40.6 8.4 284 4.76 59.66 "old dirpref" and "new dirpref" columns give a test time in seconds. speedup - speed increasement in times, ie. old dirpref / new dirpref. ------ Algorithm description The old dirpref algorithm is described in comments: /* * Find a cylinder to place a directory. * * The policy implemented by this algorithm is to select from * among those cylinder groups with above the average number of * free inodes, the one with the smallest number of directories. */ A new directory is allocated in a different cylinder groups than its parent directory resulting in a directory tree that is spreaded across all the cylinder groups. This spreading out results in a non-optimal access to the directories and files. When we have a small filesystem it is not a problem but when the filesystem is big then perfomance degradation becomes very apparent. What I mean by a big file system ? 1. A big filesystem is a filesystem which occupy 20-30 or more percent of total drive space, i.e. first and last cylinder are physically located relatively far from each other. 2. It has a relatively large number of cylinder groups, for example more cylinder groups than 50% of the buffers in the buffer cache. The first results in long access times, while the second results in many buffers being used by metadata operations. Such operations use cylinder group blocks and on-disk inode blocks. The cylinder group block (fs->fs_cblkno) contains struct cg, inode and block bit maps. It is 2k in size for the default filesystem parameters. If new and parent directories are located in different cylinder groups then the system performs more input/output operations and uses more buffers. On filesystems with many cylinder groups, lots of cache buffers are used for metadata operations. My solution for this problem is very simple. I allocate many directories in one cylinder group. I also do some things, so that the new allocation method does not cause excessive fragmentation and all directory inodes will not be located at a location far from its file's inodes and data. The algorithm is: /* * Find a cylinder group to place a directory. * * The policy implemented by this algorithm is to allocate a * directory inode in the same cylinder group as its parent * directory, but also to reserve space for its files inodes * and data. Restrict the number of directories which may be * allocated one after another in the same cylinder group * without intervening allocation of files. * * If we allocate a first level directory then force allocation * in another cylinder group. */ My early versions of dirpref give me a good results for a wide range of file operations and different filesystem capacities except one case: those applications that create their entire directory structure first and only later fill this structure with files. My solution for such and similar cases is to limit a number of directories which may be created one after another in the same cylinder group without intervening file creations. For this purpose, I allocate an array of counters at mount time. This array is linked to the superblock fs->fs_contigdirs[cg]. Each time a directory is created the counter increases and each time a file is created the counter decreases. A 60Gb filesystem with 8mb/cg requires 10kb of memory for the counters array. The maxcontigdirs is a maximum number of directories which may be created without an intervening file creation. I found in my tests that the best performance occurs when I restrict the number of directories in one cylinder group such that all its files may be located in the same cylinder group. There may be some deterioration in performance if all the file inodes are in the same cylinder group as its containing directory, but their data partially resides in a different cylinder group. The maxcontigdirs value is calculated to try to prevent this condition. Since there is no way to know how many files and directories will be allocated later I added two optimization parameters in superblock/tunefs. They are: int32_t fs_avgfilesize; /* expected average file size */ int32_t fs_avgfpdir; /* expected # of files per directory */ These parameters have reasonable defaults but may be tweeked for special uses of a filesystem. They are only necessary in rare cases like better tuning a filesystem being used to store a squid cache. I have been using this algorithm for about 3 months. I have done a lot of testing on filesystems with different capacities, average filesize, average number of files per directory, and so on. I think this algorithm has no negative impact on filesystem perfomance. It works better than the default one in all cases. The new dirpref will greatly improve untarring/removing/coping of big directories, decrease load on cvs servers and much more. The new dirpref doesn't speedup a compilation process, but also doesn't slow it down. Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
* of cluster sizes that is computed as cylinder groups are inspected,
* and the third points to an array that tracks the creation of new
* directories.
1994-05-24 10:09:53 +00:00
*/
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>. His description of the problem and solution follow. My own tests show speedups on typical filesystem intensive workloads of 5% to 12% which is very impressive considering the small amount of code change involved. ------ One day I noticed that some file operations run much faster on small file systems then on big ones. I've looked at the ffs algorithms, thought about them, and redesigned the dirpref algorithm. First I want to describe the results of my tests. These results are old and I have improved the algorithm after these tests were done. Nevertheless they show how big the perfomance speedup may be. I have done two file/directory intensive tests on a two OpenBSD systems with old and new dirpref algorithm. The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports". The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release. It contains 6596 directories and 13868 files. The test systems are: 1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for test is at wd1. Size of test file system is 8 Gb, number of cg=991, size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=35 2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system at wd0, file system for test is at wd1. Size of test file system is 40 Gb, number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50 You can get more info about the test systems and methods at: http://www.ptci.ru/gluk/dirpref/old/dirpref.html Test Results tar -xzf ports.tar.gz rm -rf ports mode old dirpref new dirpref speedup old dirprefnew dirpref speedup First system normal 667 472 1.41 477 331 1.44 async 285 144 1.98 130 14 9.29 sync 768 616 1.25 477 334 1.43 softdep 413 252 1.64 241 38 6.34 Second system normal 329 81 4.06 263.5 93.5 2.81 async 302 25.7 11.75 112 2.26 49.56 sync 281 57.0 4.93 263 90.5 2.9 softdep 341 40.6 8.4 284 4.76 59.66 "old dirpref" and "new dirpref" columns give a test time in seconds. speedup - speed increasement in times, ie. old dirpref / new dirpref. ------ Algorithm description The old dirpref algorithm is described in comments: /* * Find a cylinder to place a directory. * * The policy implemented by this algorithm is to select from * among those cylinder groups with above the average number of * free inodes, the one with the smallest number of directories. */ A new directory is allocated in a different cylinder groups than its parent directory resulting in a directory tree that is spreaded across all the cylinder groups. This spreading out results in a non-optimal access to the directories and files. When we have a small filesystem it is not a problem but when the filesystem is big then perfomance degradation becomes very apparent. What I mean by a big file system ? 1. A big filesystem is a filesystem which occupy 20-30 or more percent of total drive space, i.e. first and last cylinder are physically located relatively far from each other. 2. It has a relatively large number of cylinder groups, for example more cylinder groups than 50% of the buffers in the buffer cache. The first results in long access times, while the second results in many buffers being used by metadata operations. Such operations use cylinder group blocks and on-disk inode blocks. The cylinder group block (fs->fs_cblkno) contains struct cg, inode and block bit maps. It is 2k in size for the default filesystem parameters. If new and parent directories are located in different cylinder groups then the system performs more input/output operations and uses more buffers. On filesystems with many cylinder groups, lots of cache buffers are used for metadata operations. My solution for this problem is very simple. I allocate many directories in one cylinder group. I also do some things, so that the new allocation method does not cause excessive fragmentation and all directory inodes will not be located at a location far from its file's inodes and data. The algorithm is: /* * Find a cylinder group to place a directory. * * The policy implemented by this algorithm is to allocate a * directory inode in the same cylinder group as its parent * directory, but also to reserve space for its files inodes * and data. Restrict the number of directories which may be * allocated one after another in the same cylinder group * without intervening allocation of files. * * If we allocate a first level directory then force allocation * in another cylinder group. */ My early versions of dirpref give me a good results for a wide range of file operations and different filesystem capacities except one case: those applications that create their entire directory structure first and only later fill this structure with files. My solution for such and similar cases is to limit a number of directories which may be created one after another in the same cylinder group without intervening file creations. For this purpose, I allocate an array of counters at mount time. This array is linked to the superblock fs->fs_contigdirs[cg]. Each time a directory is created the counter increases and each time a file is created the counter decreases. A 60Gb filesystem with 8mb/cg requires 10kb of memory for the counters array. The maxcontigdirs is a maximum number of directories which may be created without an intervening file creation. I found in my tests that the best performance occurs when I restrict the number of directories in one cylinder group such that all its files may be located in the same cylinder group. There may be some deterioration in performance if all the file inodes are in the same cylinder group as its containing directory, but their data partially resides in a different cylinder group. The maxcontigdirs value is calculated to try to prevent this condition. Since there is no way to know how many files and directories will be allocated later I added two optimization parameters in superblock/tunefs. They are: int32_t fs_avgfilesize; /* expected average file size */ int32_t fs_avgfpdir; /* expected # of files per directory */ These parameters have reasonable defaults but may be tweeked for special uses of a filesystem. They are only necessary in rare cases like better tuning a filesystem being used to store a squid cache. I have been using this algorithm for about 3 months. I have done a lot of testing on filesystems with different capacities, average filesize, average number of files per directory, and so on. I think this algorithm has no negative impact on filesystem perfomance. It works better than the default one in all cases. The new dirpref will greatly improve untarring/removing/coping of big directories, decrease load on cvs servers and much more. The new dirpref doesn't speedup a compilation process, but also doesn't slow it down. Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
#define NOCSPTRS ((128 / sizeof(void *)) - 3)
1994-05-24 10:09:53 +00:00
/*
* A summary of contiguous blocks of various sizes is maintained
* in each cylinder group. Normally this is set by the initial
* value of fs_maxcontig. To conserve space, a maximum summary size
* is set by FS_MAXCONTIG.
*/
#define FS_MAXCONTIG 16
/*
* MINFREE gives the minimum acceptable percentage of file system
* blocks which may be free. If the freelist drops below this level
* only the superuser may continue to allocate blocks. This may
* be set to 0 if no reserve of free blocks is deemed necessary,
* however throughput drops by fifty percent if the file system
* is run at between 95% and 100% full; thus the minimum default
* value of fs_minfree is 5%. However, to get good clustering
* performance, 10% is a better choice. hence we use 10% as our
* default value. With 10% free space, fragmentation is not a
* problem, so we choose to optimize for time.
*/
1995-03-10 22:18:16 +00:00
#define MINFREE 8
1994-05-24 10:09:53 +00:00
#define DEFAULTOPT FS_OPTTIME
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>. His description of the problem and solution follow. My own tests show speedups on typical filesystem intensive workloads of 5% to 12% which is very impressive considering the small amount of code change involved. ------ One day I noticed that some file operations run much faster on small file systems then on big ones. I've looked at the ffs algorithms, thought about them, and redesigned the dirpref algorithm. First I want to describe the results of my tests. These results are old and I have improved the algorithm after these tests were done. Nevertheless they show how big the perfomance speedup may be. I have done two file/directory intensive tests on a two OpenBSD systems with old and new dirpref algorithm. The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports". The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release. It contains 6596 directories and 13868 files. The test systems are: 1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for test is at wd1. Size of test file system is 8 Gb, number of cg=991, size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=35 2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system at wd0, file system for test is at wd1. Size of test file system is 40 Gb, number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50 You can get more info about the test systems and methods at: http://www.ptci.ru/gluk/dirpref/old/dirpref.html Test Results tar -xzf ports.tar.gz rm -rf ports mode old dirpref new dirpref speedup old dirprefnew dirpref speedup First system normal 667 472 1.41 477 331 1.44 async 285 144 1.98 130 14 9.29 sync 768 616 1.25 477 334 1.43 softdep 413 252 1.64 241 38 6.34 Second system normal 329 81 4.06 263.5 93.5 2.81 async 302 25.7 11.75 112 2.26 49.56 sync 281 57.0 4.93 263 90.5 2.9 softdep 341 40.6 8.4 284 4.76 59.66 "old dirpref" and "new dirpref" columns give a test time in seconds. speedup - speed increasement in times, ie. old dirpref / new dirpref. ------ Algorithm description The old dirpref algorithm is described in comments: /* * Find a cylinder to place a directory. * * The policy implemented by this algorithm is to select from * among those cylinder groups with above the average number of * free inodes, the one with the smallest number of directories. */ A new directory is allocated in a different cylinder groups than its parent directory resulting in a directory tree that is spreaded across all the cylinder groups. This spreading out results in a non-optimal access to the directories and files. When we have a small filesystem it is not a problem but when the filesystem is big then perfomance degradation becomes very apparent. What I mean by a big file system ? 1. A big filesystem is a filesystem which occupy 20-30 or more percent of total drive space, i.e. first and last cylinder are physically located relatively far from each other. 2. It has a relatively large number of cylinder groups, for example more cylinder groups than 50% of the buffers in the buffer cache. The first results in long access times, while the second results in many buffers being used by metadata operations. Such operations use cylinder group blocks and on-disk inode blocks. The cylinder group block (fs->fs_cblkno) contains struct cg, inode and block bit maps. It is 2k in size for the default filesystem parameters. If new and parent directories are located in different cylinder groups then the system performs more input/output operations and uses more buffers. On filesystems with many cylinder groups, lots of cache buffers are used for metadata operations. My solution for this problem is very simple. I allocate many directories in one cylinder group. I also do some things, so that the new allocation method does not cause excessive fragmentation and all directory inodes will not be located at a location far from its file's inodes and data. The algorithm is: /* * Find a cylinder group to place a directory. * * The policy implemented by this algorithm is to allocate a * directory inode in the same cylinder group as its parent * directory, but also to reserve space for its files inodes * and data. Restrict the number of directories which may be * allocated one after another in the same cylinder group * without intervening allocation of files. * * If we allocate a first level directory then force allocation * in another cylinder group. */ My early versions of dirpref give me a good results for a wide range of file operations and different filesystem capacities except one case: those applications that create their entire directory structure first and only later fill this structure with files. My solution for such and similar cases is to limit a number of directories which may be created one after another in the same cylinder group without intervening file creations. For this purpose, I allocate an array of counters at mount time. This array is linked to the superblock fs->fs_contigdirs[cg]. Each time a directory is created the counter increases and each time a file is created the counter decreases. A 60Gb filesystem with 8mb/cg requires 10kb of memory for the counters array. The maxcontigdirs is a maximum number of directories which may be created without an intervening file creation. I found in my tests that the best performance occurs when I restrict the number of directories in one cylinder group such that all its files may be located in the same cylinder group. There may be some deterioration in performance if all the file inodes are in the same cylinder group as its containing directory, but their data partially resides in a different cylinder group. The maxcontigdirs value is calculated to try to prevent this condition. Since there is no way to know how many files and directories will be allocated later I added two optimization parameters in superblock/tunefs. They are: int32_t fs_avgfilesize; /* expected average file size */ int32_t fs_avgfpdir; /* expected # of files per directory */ These parameters have reasonable defaults but may be tweeked for special uses of a filesystem. They are only necessary in rare cases like better tuning a filesystem being used to store a squid cache. I have been using this algorithm for about 3 months. I have done a lot of testing on filesystems with different capacities, average filesize, average number of files per directory, and so on. I think this algorithm has no negative impact on filesystem perfomance. It works better than the default one in all cases. The new dirpref will greatly improve untarring/removing/coping of big directories, decrease load on cvs servers and much more. The new dirpref doesn't speedup a compilation process, but also doesn't slow it down. Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
/*
* Grigoriy Orlov <gluk@ptci.ru> has done some extensive work to fine
* tune the layout preferences for directories within a filesystem.
* His algorithm can be tuned by adjusting the following parameters
* which tell the system the average file size and the average number
* of files per directory. These defaults are well selected for typical
* filesystems, but may need to be tuned for odd cases like filesystems
* being used for sqiud caches or news spools.
*/
#define AVFILESIZ 16384 /* expected average file size */
#define AFPDIR 64 /* expected number of files per directory */
/*
* The maximum number of snapshot nodes that can be associated
* with each filesystem. This limit affects only the number of
* snapshot files that can be recorded within the superblock so
* that they can be found when the filesystem is mounted. However,
* maintaining too many will slow the filesystem performance, so
* having this limit is a good idea.
*/
#define FSMAXSNAP 20
/*
* Used to identify special blocks in snapshots:
*
* BLK_NOCOPY - A block that was unallocated at the time the snapshot
* was taken, hence does not need to be copied when written.
* BLK_SNAP - A block held by another snapshot that is not needed by this
* snapshot. When the other snapshot is freed, the BLK_SNAP entries
* are converted to BLK_NOCOPY. These are needed to allow fsck to
* identify blocks that are in use by other snapshots (which are
* expunged from this snapshot).
*/
#define BLK_NOCOPY ((ufs_daddr_t)(1))
#define BLK_SNAP ((ufs_daddr_t)(2))
/*
* Sysctl values for the fast filesystem.
*/
#define FFS_ADJ_REFCNT 1 /* adjust inode reference count */
#define FFS_ADJ_BLKCNT 2 /* adjust inode used block count */
#define FFS_BLK_FREE 3 /* free range of blocks in map */
#define FFS_DIR_FREE 4 /* free specified dir inodes in map */
#define FFS_FILE_FREE 5 /* free specified file inodes in map */
#define FFS_SET_FLAGS 6 /* set filesystem flags */
#define FFS_MAXID 7 /* number of valid ffs ids */
/*
* Command structure passed in to the filesystem to adjust filesystem values.
*/
#define FFS_CMD_VERSION 0x05181979 /* version ID */
struct fsck_cmd {
int version; /* version of command structure */
int handle; /* reference to filesystem to be changed */
off_t value; /* inode or block number to be affected */
long size; /* amount or range to be adjusted */
};
1994-05-24 10:09:53 +00:00
/*
* Per cylinder group information; summarized in blocks allocated
* from first cylinder group data blocks. These blocks have to be
* read in from fs_csaddr (size fs_cssize) in addition to the
* super block.
*/
struct csum {
int32_t cs_ndir; /* number of directories */
int32_t cs_nbfree; /* number of free blocks */
int32_t cs_nifree; /* number of free inodes */
int32_t cs_nffree; /* number of free frags */
1994-05-24 10:09:53 +00:00
};
/*
* Super block for an FFS file system.
1994-05-24 10:09:53 +00:00
*/
struct fs {
int32_t fs_firstfield; /* historic file system linked list, */
int32_t fs_unused_1; /* used for incore super blocks */
ufs_daddr_t fs_sblkno; /* addr of super-block in filesys */
ufs_daddr_t fs_cblkno; /* offset of cyl-block in filesys */
ufs_daddr_t fs_iblkno; /* offset of inode-blocks in filesys */
ufs_daddr_t fs_dblkno; /* offset of first data after cg */
int32_t fs_cgoffset; /* cylinder group offset in cylinder */
int32_t fs_cgmask; /* used to calc mod fs_ntrak */
ufs_time_t fs_time; /* last time written */
int32_t fs_size; /* number of blocks in fs */
int32_t fs_dsize; /* number of data blocks in fs */
int32_t fs_ncg; /* number of cylinder groups */
int32_t fs_bsize; /* size of basic blocks in fs */
int32_t fs_fsize; /* size of frag blocks in fs */
int32_t fs_frag; /* number of frags in a block in fs */
1994-05-24 10:09:53 +00:00
/* these are configuration parameters */
int32_t fs_minfree; /* minimum percentage of free blocks */
int32_t fs_rotdelay; /* num of ms for optimal next block */
int32_t fs_rps; /* disk revolutions per second */
1994-05-24 10:09:53 +00:00
/* these fields can be computed from the others */
int32_t fs_bmask; /* ``blkoff'' calc of blk offsets */
int32_t fs_fmask; /* ``fragoff'' calc of frag offsets */
int32_t fs_bshift; /* ``lblkno'' calc of logical blkno */
int32_t fs_fshift; /* ``numfrags'' calc number of frags */
1994-05-24 10:09:53 +00:00
/* these are configuration parameters */
int32_t fs_maxcontig; /* max number of contiguous blks */
int32_t fs_maxbpg; /* max number of blks per cyl group */
1994-05-24 10:09:53 +00:00
/* these fields can be computed from the others */
int32_t fs_fragshift; /* block to frag shift */
int32_t fs_fsbtodb; /* fsbtodb and dbtofsb shift constant */
int32_t fs_sbsize; /* actual size of super block */
int32_t fs_csmask; /* csum block offset (now unused) */
int32_t fs_csshift; /* csum block number (now unused) */
int32_t fs_nindir; /* value of NINDIR */
int32_t fs_inopb; /* value of INOPB */
int32_t fs_nspf; /* value of NSPF */
1994-05-24 10:09:53 +00:00
/* yet another configuration parameter */
int32_t fs_optim; /* optimization preference, see below */
1994-05-24 10:09:53 +00:00
/* these fields are derived from the hardware */
int32_t fs_npsect; /* # sectors/track including spares */
int32_t fs_interleave; /* hardware sector interleave */
int32_t fs_trackskew; /* sector 0 skew, per track */
/* fs_id takes the space of the unused fs_headswitch and fs_trkseek fields */
int32_t fs_id[2]; /* unique filesystem id */
1994-05-24 10:09:53 +00:00
/* sizes determined by number of cylinder groups and their sizes */
ufs_daddr_t fs_csaddr; /* blk addr of cyl grp summary area */
int32_t fs_cssize; /* size of cyl grp summary area */
int32_t fs_cgsize; /* cylinder group size */
1994-05-24 10:09:53 +00:00
/* these fields are derived from the hardware */
int32_t fs_ntrak; /* tracks per cylinder */
int32_t fs_nsect; /* sectors per track */
int32_t fs_spc; /* sectors per cylinder */
1994-05-24 10:09:53 +00:00
/* this comes from the disk driver partitioning */
int32_t fs_ncyl; /* cylinders in file system */
1994-05-24 10:09:53 +00:00
/* these fields can be computed from the others */
int32_t fs_cpg; /* cylinders per group */
int32_t fs_ipg; /* inodes per group */
int32_t fs_fpg; /* blocks per group * fs_frag */
1994-05-24 10:09:53 +00:00
/* this data must be re-computed after crashes */
struct csum fs_cstotal; /* cylinder summary information */
/* these fields are cleared at mount time */
int8_t fs_fmod; /* super block modified flag */
int8_t fs_clean; /* file system is clean flag */
int8_t fs_ronly; /* mounted read-only flag */
int8_t fs_flags; /* see FS_ flags below */
u_char fs_fsmnt[MAXMNTLEN]; /* name mounted on */
1994-05-24 10:09:53 +00:00
/* these fields retain the current block allocation info */
int32_t fs_cgrotor; /* last cg searched */
void *fs_ocsp[NOCSPTRS]; /* padding; was list of fs_cs buffers */
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>. His description of the problem and solution follow. My own tests show speedups on typical filesystem intensive workloads of 5% to 12% which is very impressive considering the small amount of code change involved. ------ One day I noticed that some file operations run much faster on small file systems then on big ones. I've looked at the ffs algorithms, thought about them, and redesigned the dirpref algorithm. First I want to describe the results of my tests. These results are old and I have improved the algorithm after these tests were done. Nevertheless they show how big the perfomance speedup may be. I have done two file/directory intensive tests on a two OpenBSD systems with old and new dirpref algorithm. The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports". The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release. It contains 6596 directories and 13868 files. The test systems are: 1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for test is at wd1. Size of test file system is 8 Gb, number of cg=991, size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=35 2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system at wd0, file system for test is at wd1. Size of test file system is 40 Gb, number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50 You can get more info about the test systems and methods at: http://www.ptci.ru/gluk/dirpref/old/dirpref.html Test Results tar -xzf ports.tar.gz rm -rf ports mode old dirpref new dirpref speedup old dirprefnew dirpref speedup First system normal 667 472 1.41 477 331 1.44 async 285 144 1.98 130 14 9.29 sync 768 616 1.25 477 334 1.43 softdep 413 252 1.64 241 38 6.34 Second system normal 329 81 4.06 263.5 93.5 2.81 async 302 25.7 11.75 112 2.26 49.56 sync 281 57.0 4.93 263 90.5 2.9 softdep 341 40.6 8.4 284 4.76 59.66 "old dirpref" and "new dirpref" columns give a test time in seconds. speedup - speed increasement in times, ie. old dirpref / new dirpref. ------ Algorithm description The old dirpref algorithm is described in comments: /* * Find a cylinder to place a directory. * * The policy implemented by this algorithm is to select from * among those cylinder groups with above the average number of * free inodes, the one with the smallest number of directories. */ A new directory is allocated in a different cylinder groups than its parent directory resulting in a directory tree that is spreaded across all the cylinder groups. This spreading out results in a non-optimal access to the directories and files. When we have a small filesystem it is not a problem but when the filesystem is big then perfomance degradation becomes very apparent. What I mean by a big file system ? 1. A big filesystem is a filesystem which occupy 20-30 or more percent of total drive space, i.e. first and last cylinder are physically located relatively far from each other. 2. It has a relatively large number of cylinder groups, for example more cylinder groups than 50% of the buffers in the buffer cache. The first results in long access times, while the second results in many buffers being used by metadata operations. Such operations use cylinder group blocks and on-disk inode blocks. The cylinder group block (fs->fs_cblkno) contains struct cg, inode and block bit maps. It is 2k in size for the default filesystem parameters. If new and parent directories are located in different cylinder groups then the system performs more input/output operations and uses more buffers. On filesystems with many cylinder groups, lots of cache buffers are used for metadata operations. My solution for this problem is very simple. I allocate many directories in one cylinder group. I also do some things, so that the new allocation method does not cause excessive fragmentation and all directory inodes will not be located at a location far from its file's inodes and data. The algorithm is: /* * Find a cylinder group to place a directory. * * The policy implemented by this algorithm is to allocate a * directory inode in the same cylinder group as its parent * directory, but also to reserve space for its files inodes * and data. Restrict the number of directories which may be * allocated one after another in the same cylinder group * without intervening allocation of files. * * If we allocate a first level directory then force allocation * in another cylinder group. */ My early versions of dirpref give me a good results for a wide range of file operations and different filesystem capacities except one case: those applications that create their entire directory structure first and only later fill this structure with files. My solution for such and similar cases is to limit a number of directories which may be created one after another in the same cylinder group without intervening file creations. For this purpose, I allocate an array of counters at mount time. This array is linked to the superblock fs->fs_contigdirs[cg]. Each time a directory is created the counter increases and each time a file is created the counter decreases. A 60Gb filesystem with 8mb/cg requires 10kb of memory for the counters array. The maxcontigdirs is a maximum number of directories which may be created without an intervening file creation. I found in my tests that the best performance occurs when I restrict the number of directories in one cylinder group such that all its files may be located in the same cylinder group. There may be some deterioration in performance if all the file inodes are in the same cylinder group as its containing directory, but their data partially resides in a different cylinder group. The maxcontigdirs value is calculated to try to prevent this condition. Since there is no way to know how many files and directories will be allocated later I added two optimization parameters in superblock/tunefs. They are: int32_t fs_avgfilesize; /* expected average file size */ int32_t fs_avgfpdir; /* expected # of files per directory */ These parameters have reasonable defaults but may be tweeked for special uses of a filesystem. They are only necessary in rare cases like better tuning a filesystem being used to store a squid cache. I have been using this algorithm for about 3 months. I have done a lot of testing on filesystems with different capacities, average filesize, average number of files per directory, and so on. I think this algorithm has no negative impact on filesystem perfomance. It works better than the default one in all cases. The new dirpref will greatly improve untarring/removing/coping of big directories, decrease load on cvs servers and much more. The new dirpref doesn't speedup a compilation process, but also doesn't slow it down. Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
u_int8_t *fs_contigdirs; /* # of contiguously allocated dirs */
struct csum *fs_csp; /* cg summary info buffer for fs_cs */
int32_t *fs_maxcluster; /* max cluster in each cyl group */
int32_t fs_cpc; /* cyl per cycle in postbl */
int16_t fs_opostbl[16][8]; /* old rotation block list head */
int32_t fs_snapinum[FSMAXSNAP];/* list of snapshot inode numbers */
Directory layout preference improvements from Grigoriy Orlov <gluk@ptci.ru>. His description of the problem and solution follow. My own tests show speedups on typical filesystem intensive workloads of 5% to 12% which is very impressive considering the small amount of code change involved. ------ One day I noticed that some file operations run much faster on small file systems then on big ones. I've looked at the ffs algorithms, thought about them, and redesigned the dirpref algorithm. First I want to describe the results of my tests. These results are old and I have improved the algorithm after these tests were done. Nevertheless they show how big the perfomance speedup may be. I have done two file/directory intensive tests on a two OpenBSD systems with old and new dirpref algorithm. The first test is "tar -xzf ports.tar.gz", the second is "rm -rf ports". The ports.tar.gz file is the ports collection from the OpenBSD 2.8 release. It contains 6596 directories and 13868 files. The test systems are: 1. Celeron-450, 128Mb, two IDE drives, the system at wd0, file system for test is at wd1. Size of test file system is 8 Gb, number of cg=991, size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=35 2. PIII-600, 128Mb, two IBM DTLA-307045 IDE drives at i815e, the system at wd0, file system for test is at wd1. Size of test file system is 40 Gb, number of cg=5324, size of cg is 8m, block size = 8k, fragment size = 1k OpenBSD-current from Dec 2000 with BUFCACHEPERCENT=50 You can get more info about the test systems and methods at: http://www.ptci.ru/gluk/dirpref/old/dirpref.html Test Results tar -xzf ports.tar.gz rm -rf ports mode old dirpref new dirpref speedup old dirprefnew dirpref speedup First system normal 667 472 1.41 477 331 1.44 async 285 144 1.98 130 14 9.29 sync 768 616 1.25 477 334 1.43 softdep 413 252 1.64 241 38 6.34 Second system normal 329 81 4.06 263.5 93.5 2.81 async 302 25.7 11.75 112 2.26 49.56 sync 281 57.0 4.93 263 90.5 2.9 softdep 341 40.6 8.4 284 4.76 59.66 "old dirpref" and "new dirpref" columns give a test time in seconds. speedup - speed increasement in times, ie. old dirpref / new dirpref. ------ Algorithm description The old dirpref algorithm is described in comments: /* * Find a cylinder to place a directory. * * The policy implemented by this algorithm is to select from * among those cylinder groups with above the average number of * free inodes, the one with the smallest number of directories. */ A new directory is allocated in a different cylinder groups than its parent directory resulting in a directory tree that is spreaded across all the cylinder groups. This spreading out results in a non-optimal access to the directories and files. When we have a small filesystem it is not a problem but when the filesystem is big then perfomance degradation becomes very apparent. What I mean by a big file system ? 1. A big filesystem is a filesystem which occupy 20-30 or more percent of total drive space, i.e. first and last cylinder are physically located relatively far from each other. 2. It has a relatively large number of cylinder groups, for example more cylinder groups than 50% of the buffers in the buffer cache. The first results in long access times, while the second results in many buffers being used by metadata operations. Such operations use cylinder group blocks and on-disk inode blocks. The cylinder group block (fs->fs_cblkno) contains struct cg, inode and block bit maps. It is 2k in size for the default filesystem parameters. If new and parent directories are located in different cylinder groups then the system performs more input/output operations and uses more buffers. On filesystems with many cylinder groups, lots of cache buffers are used for metadata operations. My solution for this problem is very simple. I allocate many directories in one cylinder group. I also do some things, so that the new allocation method does not cause excessive fragmentation and all directory inodes will not be located at a location far from its file's inodes and data. The algorithm is: /* * Find a cylinder group to place a directory. * * The policy implemented by this algorithm is to allocate a * directory inode in the same cylinder group as its parent * directory, but also to reserve space for its files inodes * and data. Restrict the number of directories which may be * allocated one after another in the same cylinder group * without intervening allocation of files. * * If we allocate a first level directory then force allocation * in another cylinder group. */ My early versions of dirpref give me a good results for a wide range of file operations and different filesystem capacities except one case: those applications that create their entire directory structure first and only later fill this structure with files. My solution for such and similar cases is to limit a number of directories which may be created one after another in the same cylinder group without intervening file creations. For this purpose, I allocate an array of counters at mount time. This array is linked to the superblock fs->fs_contigdirs[cg]. Each time a directory is created the counter increases and each time a file is created the counter decreases. A 60Gb filesystem with 8mb/cg requires 10kb of memory for the counters array. The maxcontigdirs is a maximum number of directories which may be created without an intervening file creation. I found in my tests that the best performance occurs when I restrict the number of directories in one cylinder group such that all its files may be located in the same cylinder group. There may be some deterioration in performance if all the file inodes are in the same cylinder group as its containing directory, but their data partially resides in a different cylinder group. The maxcontigdirs value is calculated to try to prevent this condition. Since there is no way to know how many files and directories will be allocated later I added two optimization parameters in superblock/tunefs. They are: int32_t fs_avgfilesize; /* expected average file size */ int32_t fs_avgfpdir; /* expected # of files per directory */ These parameters have reasonable defaults but may be tweeked for special uses of a filesystem. They are only necessary in rare cases like better tuning a filesystem being used to store a squid cache. I have been using this algorithm for about 3 months. I have done a lot of testing on filesystems with different capacities, average filesize, average number of files per directory, and so on. I think this algorithm has no negative impact on filesystem perfomance. It works better than the default one in all cases. The new dirpref will greatly improve untarring/removing/coping of big directories, decrease load on cvs servers and much more. The new dirpref doesn't speedup a compilation process, but also doesn't slow it down. Obtained from: Grigoriy Orlov <gluk@ptci.ru>
2001-04-10 08:38:59 +00:00
int32_t fs_avgfilesize; /* expected average file size */
int32_t fs_avgfpdir; /* expected # of files per directory */
int32_t fs_sparecon[26]; /* reserved for future constants */
int32_t fs_pendingblocks; /* blocks in process of being freed */
int32_t fs_pendinginodes; /* inodes in process of being freed */
int32_t fs_contigsumsize; /* size of cluster summary array */
int32_t fs_maxsymlinklen; /* max length of an internal symlink */
int32_t fs_inodefmt; /* format of on-disk inodes */
u_int64_t fs_maxfilesize; /* maximum representable file size */
int64_t fs_qbmask; /* ~fs_bmask for use with 64-bit size */
int64_t fs_qfmask; /* ~fs_fmask for use with 64-bit size */
int32_t fs_state; /* validate fs_clean field */
int32_t fs_postblformat; /* format of positional layout tables */
int32_t fs_nrpos; /* number of rotational positions */
int32_t fs_postbloff; /* (u_int16) rotation block list head */
int32_t fs_rotbloff; /* (u_int8) blocks for each rotation */
int32_t fs_magic; /* magic number */
u_int8_t fs_space[1]; /* list of blocks for each rotation */
1994-05-24 10:09:53 +00:00
/* actually longer */
};
1994-05-24 10:09:53 +00:00
/*
* Filesystem identification
1994-05-24 10:09:53 +00:00
*/
#define FS_MAGIC 0x011954 /* the fast filesystem magic number */
#define FS_OKAY 0x7c269d38 /* superblock checksum */
#define FS_42INODEFMT -1 /* 4.2BSD inode format */
#define FS_44INODEFMT 2 /* 4.4BSD inode format */
1994-05-24 10:09:53 +00:00
/*
* Preference for optimization.
*/
#define FS_OPTTIME 0 /* minimize allocation time */
#define FS_OPTSPACE 1 /* minimize disk fragmentation */
/*
* Filesystem flags.
*
* Note that the FS_NEEDSFSCK flag is set and cleared only by the
* fsck utility. It is set when background fsck finds an unexpected
* inconsistency which requires a traditional foreground fsck to be
* run. Such inconsistencies should only be found after an uncorrectable
* disk error. A foreground fsck will clear the FS_NEEDSFSCK flag when
* it has successfully cleaned up the filesystem. The kernel uses this
* flag to enforce that inconsistent filesystems be mounted read-only.
*/
#define FS_UNCLEAN 0x01 /* filesystem not clean at mount */
#define FS_DOSOFTDEP 0x02 /* filesystem using soft dependencies */
#define FS_NEEDSFSCK 0x04 /* filesystem needs sync fsck before mount */
1994-05-24 10:09:53 +00:00
/*
* Rotational layout table format types
*/
#define FS_42POSTBLFMT -1 /* 4.2BSD rotational table format */
#define FS_DYNAMICPOSTBLFMT 1 /* dynamic rotational table format */
/*
* Macros for access to superblock array structures
*/
#define fs_postbl(fs, cylno) \
(((fs)->fs_postblformat == FS_42POSTBLFMT) \
? ((fs)->fs_opostbl[cylno]) \
: ((int16_t *)((u_int8_t *)(fs) + \
(fs)->fs_postbloff) + (cylno) * (fs)->fs_nrpos))
1994-05-24 10:09:53 +00:00
#define fs_rotbl(fs) \
(((fs)->fs_postblformat == FS_42POSTBLFMT) \
? ((fs)->fs_space) \
: ((u_int8_t *)((u_int8_t *)(fs) + (fs)->fs_rotbloff)))
1994-05-24 10:09:53 +00:00
/*
* The size of a cylinder group is calculated by CGSIZE. The maximum size
* is limited by the fact that cylinder groups are at most one block.
1995-05-30 08:16:23 +00:00
* Its size is derived from the size of the maps maintained in the
1994-05-24 10:09:53 +00:00
* cylinder group and the (struct cg) size.
*/
#define CGSIZE(fs) \
/* base cg */ (sizeof(struct cg) + sizeof(int32_t) + \
/* blktot size */ (fs)->fs_cpg * sizeof(int32_t) + \
/* blks size */ (fs)->fs_cpg * (fs)->fs_nrpos * sizeof(int16_t) + \
1994-05-24 10:09:53 +00:00
/* inode map */ howmany((fs)->fs_ipg, NBBY) + \
/* block map */ howmany((fs)->fs_cpg * (fs)->fs_spc / NSPF(fs), NBBY) +\
/* if present */ ((fs)->fs_contigsumsize <= 0 ? 0 : \
/* cluster sum */ (fs)->fs_contigsumsize * sizeof(int32_t) + \
1994-05-24 10:09:53 +00:00
/* cluster map */ howmany((fs)->fs_cpg * (fs)->fs_spc / NSPB(fs), NBBY)))
/*
* Convert cylinder group to base address of its global summary info.
*/
#define fs_cs(fs, indx) fs_csp[indx]
1994-05-24 10:09:53 +00:00
/*
* Cylinder group block for a file system.
*/
#define CG_MAGIC 0x090255
struct cg {
int32_t cg_firstfield; /* historic cyl groups linked list */
int32_t cg_magic; /* magic number */
ufs_time_t cg_time; /* time last written */
int32_t cg_cgx; /* we are the cgx'th cylinder group */
int16_t cg_ncyl; /* number of cyl's this cg */
int16_t cg_niblk; /* number of inode blocks this cg */
int32_t cg_ndblk; /* number of data blocks this cg */
1994-05-24 10:09:53 +00:00
struct csum cg_cs; /* cylinder summary information */
int32_t cg_rotor; /* position of last used block */
int32_t cg_frotor; /* position of last used frag */
int32_t cg_irotor; /* position of last used inode */
int32_t cg_frsum[MAXFRAG]; /* counts of available frags */
int32_t cg_btotoff; /* (int32) block totals per cylinder */
int32_t cg_boff; /* (u_int16) free block positions */
int32_t cg_iusedoff; /* (u_int8) used inode map */
int32_t cg_freeoff; /* (u_int8) free block map */
int32_t cg_nextfreeoff; /* (u_int8) next available space */
int32_t cg_clustersumoff; /* (u_int32) counts of avail clusters */
int32_t cg_clusteroff; /* (u_int8) free cluster map */
int32_t cg_nclusterblks; /* number of clusters this cg */
int32_t cg_sparecon[13]; /* reserved for future use */
u_int8_t cg_space[1]; /* space for cylinder group maps */
1994-05-24 10:09:53 +00:00
/* actually longer */
};
1994-05-24 10:09:53 +00:00
/*
* Macros for access to cylinder group array structures
*/
#define cg_blktot(cgp) \
(((cgp)->cg_magic != CG_MAGIC) \
? (((struct ocg *)(cgp))->cg_btot) \
: ((int32_t *)((u_int8_t *)(cgp) + (cgp)->cg_btotoff)))
1994-05-24 10:09:53 +00:00
#define cg_blks(fs, cgp, cylno) \
(((cgp)->cg_magic != CG_MAGIC) \
? (((struct ocg *)(cgp))->cg_b[cylno]) \
: ((int16_t *)((u_int8_t *)(cgp) + \
(cgp)->cg_boff) + (cylno) * (fs)->fs_nrpos))
1994-05-24 10:09:53 +00:00
#define cg_inosused(cgp) \
(((cgp)->cg_magic != CG_MAGIC) \
? (((struct ocg *)(cgp))->cg_iused) \
: ((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_iusedoff)))
1994-05-24 10:09:53 +00:00
#define cg_blksfree(cgp) \
(((cgp)->cg_magic != CG_MAGIC) \
? (((struct ocg *)(cgp))->cg_free) \
: ((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_freeoff)))
1994-05-24 10:09:53 +00:00
#define cg_chkmagic(cgp) \
((cgp)->cg_magic == CG_MAGIC || ((struct ocg *)(cgp))->cg_magic == CG_MAGIC)
#define cg_clustersfree(cgp) \
((u_int8_t *)((u_int8_t *)(cgp) + (cgp)->cg_clusteroff))
1994-05-24 10:09:53 +00:00
#define cg_clustersum(cgp) \
((int32_t *)((u_int8_t *)(cgp) + (cgp)->cg_clustersumoff))
1994-05-24 10:09:53 +00:00
/*
* The following structure is defined
* for compatibility with old file systems.
*/
struct ocg {
int32_t cg_firstfield; /* historic linked list of cyl groups */
int32_t cg_unused_1; /* used for incore cyl groups */
ufs_time_t cg_time; /* time last written */
int32_t cg_cgx; /* we are the cgx'th cylinder group */
int16_t cg_ncyl; /* number of cyl's this cg */
int16_t cg_niblk; /* number of inode blocks this cg */
int32_t cg_ndblk; /* number of data blocks this cg */
1994-05-24 10:09:53 +00:00
struct csum cg_cs; /* cylinder summary information */
int32_t cg_rotor; /* position of last used block */
int32_t cg_frotor; /* position of last used frag */
int32_t cg_irotor; /* position of last used inode */
int32_t cg_frsum[8]; /* counts of available frags */
int32_t cg_btot[32]; /* block totals per cylinder */
int16_t cg_b[32][8]; /* positions of free blocks */
u_int8_t cg_iused[256]; /* used inode map */
int32_t cg_magic; /* magic number */
u_int8_t cg_free[1]; /* free block map */
1994-05-24 10:09:53 +00:00
/* actually longer */
};
/*
* Turn file system block numbers into disk block addresses.
* This maps file system blocks to device size blocks.
*/
#define fsbtodb(fs, b) ((b) << (fs)->fs_fsbtodb)
#define dbtofsb(fs, b) ((b) >> (fs)->fs_fsbtodb)
/*
* Cylinder group macros to locate things in cylinder groups.
* They calc file system addresses of cylinder group data structures.
*/
#define cgbase(fs, c) ((ufs_daddr_t)((fs)->fs_fpg * (c)))
1994-05-24 10:09:53 +00:00
#define cgdmin(fs, c) (cgstart(fs, c) + (fs)->fs_dblkno) /* 1st data */
#define cgimin(fs, c) (cgstart(fs, c) + (fs)->fs_iblkno) /* inode blk */
#define cgsblock(fs, c) (cgstart(fs, c) + (fs)->fs_sblkno) /* super blk */
#define cgtod(fs, c) (cgstart(fs, c) + (fs)->fs_cblkno) /* cg block */
#define cgstart(fs, c) \
(cgbase(fs, c) + (fs)->fs_cgoffset * ((c) & ~((fs)->fs_cgmask)))
/*
* Macros for handling inode numbers:
* inode number to file system block offset.
* inode number to cylinder group number.
* inode number to file system block address.
*/
#define ino_to_cg(fs, x) ((x) / (fs)->fs_ipg)
#define ino_to_fsba(fs, x) \
((ufs_daddr_t)(cgimin(fs, ino_to_cg(fs, x)) + \
1994-05-24 10:09:53 +00:00
(blkstofrags((fs), (((x) % (fs)->fs_ipg) / INOPB(fs))))))
#define ino_to_fsbo(fs, x) ((x) % INOPB(fs))
/*
* Give cylinder group number for a file system block.
* Give cylinder group block number for a file system block.
*/
#define dtog(fs, d) ((d) / (fs)->fs_fpg)
#define dtogd(fs, d) ((d) % (fs)->fs_fpg)
/*
* Extract the bits for a block from a map.
* Compute the cylinder and rotational position of a cyl block addr.
*/
#define blkmap(fs, map, loc) \
(((map)[(loc) / NBBY] >> ((loc) % NBBY)) & (0xff >> (NBBY - (fs)->fs_frag)))
#define cbtocylno(fs, bno) \
((bno) * NSPF(fs) / (fs)->fs_spc)
#define cbtorpos(fs, bno) \
(((bno) * NSPF(fs) % (fs)->fs_spc / (fs)->fs_nsect * (fs)->fs_trackskew + \
(bno) * NSPF(fs) % (fs)->fs_spc % (fs)->fs_nsect * (fs)->fs_interleave) % \
(fs)->fs_nsect * (fs)->fs_nrpos / (fs)->fs_npsect)
/*
* The following macros optimize certain frequently calculated
* quantities by using shifts and masks in place of divisions
* modulos and multiplications.
*/
#define blkoff(fs, loc) /* calculates (loc % fs->fs_bsize) */ \
((loc) & (fs)->fs_qbmask)
#define fragoff(fs, loc) /* calculates (loc % fs->fs_fsize) */ \
((loc) & (fs)->fs_qfmask)
#define lblktosize(fs, blk) /* calculates ((off_t)blk * fs->fs_bsize) */ \
((off_t)(blk) << (fs)->fs_bshift)
/* Use this only when `blk' is known to be small, e.g., < NDADDR. */
#define smalllblktosize(fs, blk) /* calculates (blk * fs->fs_bsize) */ \
1994-05-24 10:09:53 +00:00
((blk) << (fs)->fs_bshift)
#define lblkno(fs, loc) /* calculates (loc / fs->fs_bsize) */ \
((loc) >> (fs)->fs_bshift)
#define numfrags(fs, loc) /* calculates (loc / fs->fs_fsize) */ \
((loc) >> (fs)->fs_fshift)
#define blkroundup(fs, size) /* calculates roundup(size, fs->fs_bsize) */ \
(((size) + (fs)->fs_qbmask) & (fs)->fs_bmask)
#define fragroundup(fs, size) /* calculates roundup(size, fs->fs_fsize) */ \
(((size) + (fs)->fs_qfmask) & (fs)->fs_fmask)
#define fragstoblks(fs, frags) /* calculates (frags / fs->fs_frag) */ \
((frags) >> (fs)->fs_fragshift)
#define blkstofrags(fs, blks) /* calculates (blks * fs->fs_frag) */ \
((blks) << (fs)->fs_fragshift)
#define fragnum(fs, fsb) /* calculates (fsb % fs->fs_frag) */ \
((fsb) & ((fs)->fs_frag - 1))
#define blknum(fs, fsb) /* calculates rounddown(fsb, fs->fs_frag) */ \
((fsb) &~ ((fs)->fs_frag - 1))
/*
* Determine the number of available frags given a
* percentage to hold in reserve.
1994-05-24 10:09:53 +00:00
*/
#define freespace(fs, percentreserved) \
(blkstofrags((fs), (fs)->fs_cstotal.cs_nbfree) + \
(fs)->fs_cstotal.cs_nffree - \
((off_t)((fs)->fs_dsize) * (percentreserved) / 100))
1994-05-24 10:09:53 +00:00
/*
* Determining the size of a file block in the file system.
*/
#define blksize(fs, ip, lbn) \
(((lbn) >= NDADDR || (ip)->i_size >= smalllblktosize(fs, (lbn) + 1)) \
1994-05-24 10:09:53 +00:00
? (fs)->fs_bsize \
: (fragroundup(fs, blkoff(fs, (ip)->i_size))))
#define dblksize(fs, dip, lbn) \
(((lbn) >= NDADDR || \
(dip)->di_size >= (u_int64_t)smalllblktosize(fs, (lbn) + 1)) \
1994-05-24 10:09:53 +00:00
? (fs)->fs_bsize \
: (fragroundup(fs, blkoff(fs, (dip)->di_size))))
#define sblksize(fs, size, lbn) \
(((lbn) >= NDADDR || (size) >= ((lbn) + 1) << (fs)->fs_bshift) \
? (fs)->fs_bsize \
: (fragroundup(fs, blkoff(fs, (size)))))
1994-05-24 10:09:53 +00:00
/*
* Number of disk sectors per block/fragment; assumes DEV_BSIZE byte
* sector size.
1994-05-24 10:09:53 +00:00
*/
#define NSPB(fs) ((fs)->fs_nspf << (fs)->fs_fragshift)
#define NSPF(fs) ((fs)->fs_nspf)
/*
* Number of inodes in a secondary storage block/fragment.
1994-05-24 10:09:53 +00:00
*/
#define INOPB(fs) ((fs)->fs_inopb)
#define INOPF(fs) ((fs)->fs_inopb >> (fs)->fs_fragshift)
/*
* Number of indirects in a file system block.
1994-05-24 10:09:53 +00:00
*/
#define NINDIR(fs) ((fs)->fs_nindir)
extern int inside[], around[];
extern u_char *fragtbl[];
#endif