makefs: Add ZFS support

This allows one to take a staged directory tree and create a file
consisting of a ZFS pool with one or more datasets that contain the
contents of the directory tree.  This is useful for creating virtual
machine images without using the kernel to create a pool; "zpool create"
requires root privileges and currently is not permitted in jails.
makefs -t zfs also provides reproducible images by using a fixed seed
for pseudo-random number generation, used for generating GUIDs and hash
salts.  makefs -t zfs requires relatively little by way of machine
resources.

The "zpool_reguid" rc.conf setting can be used to ask a FreeBSD guest to
generate a unique pool GUID upon first boot.

A small number of pool and dataset properties are supported.  The pool
is backed by a single disk vdev.  Data is always checksummed using
Fletcher-4, no redundant copies are made, and no compression is used.
The manual page documents supported pool and filesystem properties.

The implementation uses a few pieces of ZFS support from the boot
loader, especially definitions for various on-disk structures, but is
otherwise standalone and in particular doesn't depend on OpenZFS.

This feature should be treated as experimental for now, i.e., important
data shouldn't be trusted to a makefs-created pool, and the command-line
interface is subject to change.

Sponsored by:	The FreeBSD Foundation
Differential Revision:	https://reviews.freebsd.org/D35248
This commit is contained in:
Mark Johnston 2022-08-05 13:07:54 -04:00
parent 3e1101f29b
commit 240afd8c1f
14 changed files with 4509 additions and 3 deletions

View File

@ -19,6 +19,17 @@ MAN= makefs.8
NO_WCAST_ALIGN=
CSTD= c11
.if ${MK_ZFS} != "no"
SRCS+= zfs.c
CFLAGS+=-I${SRCDIR}/zfs \
-I${SRCTOP}/stand/libsa \
-I${SRCTOP}/sys/cddl/boot
CFLAGS+= -DHAVE_ZFS
.include "${SRCDIR}/zfs/Makefile.inc"
.endif
.include "${SRCDIR}/cd9660/Makefile.inc"
.include "${SRCDIR}/ffs/Makefile.inc"
.include "${SRCDIR}/msdos/Makefile.inc"

View File

@ -35,7 +35,7 @@
.\"
.\" $FreeBSD$
.\"
.Dd September 17, 2020
.Dd August 5, 2022
.Dt MAKEFS 8
.Os
.Sh NAME
@ -266,6 +266,8 @@ BSD fast file system (default).
ISO 9660 file system.
.It Sy msdos
FAT12, FAT16, or FAT32 file system.
.It Sy zfs
ZFS pool containing one or more file systems.
.El
.It Fl x
Exclude file system nodes not explicitly listed in the specfile.
@ -494,10 +496,97 @@ Volume ID.
.It Cm volume_label
Volume Label.
.El
.Ss zfs-specific options
Note: ZFS support is currently considered experimental.
Do not use it for anything critical.
.Pp
The image created by
.Nm
contains a ZFS pool with a single vdev of type
.Ql disk .
The root dataset is always created implicitly and contains the entire input
directory tree unless additional datasets are specified using the options
described below.
.Pp
The arguments consist of a keyword, an equal sign
.Pq Ql = ,
and a value.
The following keywords are supported:
.Pp
.Bl -tag -width omit-trailing-period -offset indent -compact
.It ashift
The base-2 logarithm of the minimum block size.
Typical values are 9 (512B blocks) and 12 (4KB blocks).
The default value is 12.
.It bootfs
The name of the bootable dataset for the pool.
Specifying this option causes the
.Ql bootfs
property to be set in the created pool.
.It mssize
The size of metaslabs in the created pool.
By default,
.Nm
allocates large (up to 512MB) metaslabs with the expectation that
the image will be auto-expanded upon first use.
This option allows the default heuristic to be overridden.
.It poolname
The name of the ZFS pool.
This option must be specified.
.It rootpath
An implicit path prefix added to dataset mountpoints.
By default it is
.Pa /<poolname> .
For creating bootable pools, the
.Va rootpath
should be set to
.Pa / .
At least one dataset must have a mountpoint equal to
.Va rootpath .
.It fs
Create an additional dataset.
This option may be specified multiple times.
The argument value must be of the form
.Ar <dataset>[;<prop1=v1>[;<prop2=v2>[;...]]] ,
where
.Ar dataset
is the name of the dataset and must belong to the pool's namespace.
For example, with a pool name of
.Ql test
all dataset names must be prefixed by
.Ql test/ .
A dataset must exist at each level of the pool's namespace.
For example, to create
.Ql test/foo/bar ,
.Ql test/foo
must be created as well.
.Pp
The dataset mountpoints determine how the datasets are populated with
files from the staged directory tree.
Conceptually, all datasets are mounted before any are populated with files.
The root of the staged directory tree is mapped to
.Va rootpath .
.Pp
Dataset properties, as described in
.Xr zfsprops 8 ,
may be specified following the dataset name.
The following properties may be set for a dataset:
.Pp
.Bl -tag -compact -offset indent
.It atime
.It canmount
.It exec
.It mountpoint
.It setuid
.El
.El
.Sh SEE ALSO
.Xr mtree 5 ,
.Xr mtree 8 ,
.Xr newfs 8
.Xr newfs 8 ,
.Xr zfsconcepts 8 ,
.Xr zfsprops 8 ,
.Xr zpoolprops 8
.Sh HISTORY
The
.Nm
@ -518,4 +607,6 @@ and first appeared in
.An Ram Vedam
(cd9660 support),
.An Christos Zoulas
(msdos support).
(msdos support),
.An Mark Johnston
(zfs support).

View File

@ -77,6 +77,9 @@ static fstype_t fstypes[] = {
ENTRY(cd9660),
ENTRY(ffs),
ENTRY(msdos),
#ifdef HAVE_ZFS
ENTRY(zfs),
#endif
{ .type = NULL },
};

View File

@ -78,12 +78,14 @@ enum fi_flags {
FI_SIZED = 1<<0, /* inode sized */
FI_ALLOCATED = 1<<1, /* fsinode->ino allocated */
FI_WRITTEN = 1<<2, /* inode written */
FI_ROOT = 1<<3, /* root of a ZFS dataset */
};
typedef struct {
uint32_t ino; /* inode number used on target fs */
uint32_t nlink; /* number of links to this entry */
enum fi_flags flags; /* flags used by fs specific code */
void *param; /* for use by individual fs impls */
struct stat st; /* stat entry */
} fsinode;
@ -186,6 +188,9 @@ void fs ## _makefs(const char *, const char *, fsnode *, fsinfo_t *)
DECLARE_FUN(cd9660);
DECLARE_FUN(ffs);
DECLARE_FUN(msdos);
#ifdef HAVE_ZFS
DECLARE_FUN(zfs);
#endif
extern u_int debug;
extern int dupsok;

View File

@ -2,6 +2,7 @@
ATF_TESTS_SH+= makefs_cd9660_tests
ATF_TESTS_SH+= makefs_ffs_tests
ATF_TESTS_SH+= makefs_zfs_tests
BINDIR= ${TESTSDIR}

View File

@ -0,0 +1,634 @@
#-
# SPDX-License-Identifier: BSD-2-Clause-FreeBSD
#
# Copyright (c) 2022 The FreeBSD Foundation
#
# This software was developed by Mark Johnston under sponsorship from
# the FreeBSD Foundation.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in
# the documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
MAKEFS="makefs -t zfs -o nowarn=true"
ZFS_POOL_NAME="makefstest$$"
TEST_ZFS_POOL_NAME="$TMPDIR/poolname"
. "$(dirname "$0")/makefs_tests_common.sh"
common_cleanup()
{
local pool md
# Try to force a TXG, this can help catch bugs by triggering a panic.
sync
pool=$(cat $TEST_ZFS_POOL_NAME)
if zpool list "$pool" >/dev/null; then
zpool destroy "$pool"
fi
md=$(cat $TEST_MD_DEVICE_FILE)
if [ -c /dev/"$md" ]; then
mdconfig -d -u "$md"
fi
}
import_image()
{
atf_check -e empty -o save:$TEST_MD_DEVICE_FILE -s exit:0 \
mdconfig -a -f $TEST_IMAGE
atf_check zpool import -R $TEST_MOUNT_DIR $ZFS_POOL_NAME
echo "$ZFS_POOL_NAME" > $TEST_ZFS_POOL_NAME
}
#
# Test autoexpansion of the vdev.
#
# The pool is initially 10GB, so we get 10GB minus one metaslab's worth of
# usable space for data. Then the pool is expanded to 50GB, and the amount of
# usable space is 50GB minus one metaslab.
#
atf_test_case autoexpand cleanup
autoexpand_body()
{
local mssize poolsize poolsize1 newpoolsize
create_test_inputs
mssize=$((128 * 1024 * 1024))
poolsize=$((10 * 1024 * 1024 * 1024))
atf_check $MAKEFS -s $poolsize -o mssize=$mssize -o rootpath=/ \
-o poolname=$ZFS_POOL_NAME \
$TEST_IMAGE $TEST_INPUTS_DIR
newpoolsize=$((50 * 1024 * 1024 * 1024))
truncate -s $newpoolsize $TEST_IMAGE
import_image
check_image_contents
poolsize1=$(zpool list -Hp -o size $ZFS_POOL_NAME)
atf_check [ $((poolsize1 + $mssize)) -eq $poolsize ]
atf_check zpool online -e $ZFS_POOL_NAME /dev/$(cat $TEST_MD_DEVICE_FILE)
check_image_contents
poolsize1=$(zpool list -Hp -o size $ZFS_POOL_NAME)
atf_check [ $((poolsize1 + $mssize)) -eq $newpoolsize ]
}
autoexpand_cleanup()
{
common_cleanup
}
#
# Test with some default layout defined by the common code.
#
atf_test_case basic cleanup
basic_body()
{
create_test_inputs
atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
$TEST_IMAGE $TEST_INPUTS_DIR
import_image
check_image_contents
}
basic_cleanup()
{
common_cleanup
}
atf_test_case dataset_removal cleanup
dataset_removal_body()
{
create_test_dirs
cd $TEST_INPUTS_DIR
mkdir dir
cd -
atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
-o fs=${ZFS_POOL_NAME}/dir \
$TEST_IMAGE $TEST_INPUTS_DIR
import_image
check_image_contents
atf_check zfs destroy ${ZFS_POOL_NAME}/dir
}
dataset_removal_cleanup()
{
common_cleanup
}
#
# Make sure that we can create and remove an empty directory.
#
atf_test_case empty_dir cleanup
empty_dir_body()
{
create_test_dirs
cd $TEST_INPUTS_DIR
mkdir dir
cd -
atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
$TEST_IMAGE $TEST_INPUTS_DIR
import_image
check_image_contents
atf_check rmdir ${TEST_MOUNT_DIR}/dir
}
empty_dir_cleanup()
{
common_cleanup
}
atf_test_case empty_fs cleanup
empty_fs_body()
{
create_test_dirs
atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
$TEST_IMAGE $TEST_INPUTS_DIR
import_image
check_image_contents
}
empty_fs_cleanup()
{
common_cleanup
}
atf_test_case file_sizes cleanup
file_sizes_body()
{
local i
create_test_dirs
cd $TEST_INPUTS_DIR
i=1
while [ $i -lt $((1 << 20)) ]; do
truncate -s $i ${i}.1
truncate -s $(($i - 1)) ${i}.2
truncate -s $(($i + 1)) ${i}.3
i=$(($i << 1))
done
cd -
# XXXMJ this creates sparse files, make sure makefs doesn't
# preserve the sparseness.
# XXXMJ need to test with larger files (at least 128MB for L2 indirs)
atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
$TEST_IMAGE $TEST_INPUTS_DIR
import_image
check_image_contents
}
file_sizes_cleanup()
{
common_cleanup
}
atf_test_case hard_links cleanup
hard_links_body()
{
local f
create_test_dirs
cd $TEST_INPUTS_DIR
mkdir dir
echo "hello" > 1
ln 1 2
ln 1 dir/1
echo "goodbye" > dir/a
ln dir/a dir/b
ln dir/a a
cd -
atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
$TEST_IMAGE $TEST_INPUTS_DIR
import_image
check_image_contents
stat -f '%i' ${TEST_MOUNT_DIR}/1 > ./ino
stat -f '%l' ${TEST_MOUNT_DIR}/1 > ./nlink
for f in 1 2 dir/1; do
atf_check -o file:./nlink -e empty -s exit:0 \
stat -f '%l' ${TEST_MOUNT_DIR}/${f}
atf_check -o file:./ino -e empty -s exit:0 \
stat -f '%i' ${TEST_MOUNT_DIR}/${f}
atf_check cmp -s ${TEST_INPUTS_DIR}/1 ${TEST_MOUNT_DIR}/${f}
done
stat -f '%i' ${TEST_MOUNT_DIR}/dir/a > ./ino
stat -f '%l' ${TEST_MOUNT_DIR}/dir/a > ./nlink
for f in dir/a dir/b a; do
atf_check -o file:./nlink -e empty -s exit:0 \
stat -f '%l' ${TEST_MOUNT_DIR}/${f}
atf_check -o file:./ino -e empty -s exit:0 \
stat -f '%i' ${TEST_MOUNT_DIR}/${f}
atf_check cmp -s ${TEST_INPUTS_DIR}/dir/a ${TEST_MOUNT_DIR}/${f}
done
}
hard_links_cleanup()
{
common_cleanup
}
# Allocate enough dnodes from an object set that the meta dnode needs to use
# indirect blocks.
atf_test_case indirect_dnode_array cleanup
indirect_dnode_array_body()
{
local count i
# How many dnodes do we need to allocate? Well, the data block size
# for meta dnodes is always 16KB, so with a dnode size of 512B we get
# 32 dnodes per direct block. The maximum indirect block size is 128KB
# and that can fit 1024 block pointers, so we need at least 32 * 1024
# files to force the use of two levels of indirection.
#
# Unfortunately that number of files makes the test run quite slowly,
# so we settle for a single indirect block for now...
count=$(jot -r 1 32 1024)
create_test_dirs
cd $TEST_INPUTS_DIR
for i in $(seq 1 $count); do
touch $i
done
cd -
atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
$TEST_IMAGE $TEST_INPUTS_DIR
import_image
check_image_contents
}
indirect_dnode_array_cleanup()
{
common_cleanup
}
#
# Create some files with long names, so as to test fat ZAP handling.
#
atf_test_case long_file_name cleanup
long_file_name_body()
{
local dir i
create_test_dirs
cd $TEST_INPUTS_DIR
# micro ZAP keys can be at most 50 bytes.
for i in $(seq 1 60); do
touch $(jot -s '' $i 1 1)
done
dir=$(jot -s '' 61 1 1)
mkdir $dir
for i in $(seq 1 60); do
touch ${dir}/$(jot -s '' $i 1 1)
done
cd -
atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
$TEST_IMAGE $TEST_INPUTS_DIR
import_image
check_image_contents
# Add a directory entry in the hope that OpenZFS might catch a bug
# in makefs' fat ZAP encoding.
touch ${TEST_MOUNT_DIR}/foo
}
long_file_name_cleanup()
{
common_cleanup
}
#
# Exercise handling of multiple datasets.
#
atf_test_case multi_dataset_1 cleanup
multi_dataset_1_body()
{
create_test_dirs
cd $TEST_INPUTS_DIR
mkdir dir1
echo a > dir1/a
mkdir dir2
echo b > dir2/b
cd -
atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
-o fs=${ZFS_POOL_NAME}/dir1 -o fs=${ZFS_POOL_NAME}/dir2 \
$TEST_IMAGE $TEST_INPUTS_DIR
import_image
check_image_contents
# Make sure that we have three datasets with the expected mount points.
atf_check -o inline:${ZFS_POOL_NAME}\\n -e empty -s exit:0 \
zfs list -H -o name ${ZFS_POOL_NAME}
atf_check -o inline:${TEST_MOUNT_DIR}\\n -e empty -s exit:0 \
zfs list -H -o mountpoint ${ZFS_POOL_NAME}
atf_check -o inline:${ZFS_POOL_NAME}/dir1\\n -e empty -s exit:0 \
zfs list -H -o name ${ZFS_POOL_NAME}/dir1
atf_check -o inline:${TEST_MOUNT_DIR}/dir1\\n -e empty -s exit:0 \
zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1
atf_check -o inline:${ZFS_POOL_NAME}/dir2\\n -e empty -s exit:0 \
zfs list -H -o name ${ZFS_POOL_NAME}/dir2
atf_check -o inline:${TEST_MOUNT_DIR}/dir2\\n -e empty -s exit:0 \
zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir2
}
multi_dataset_1_cleanup()
{
common_cleanup
}
#
# Create a pool with two datasets, where the root dataset is mounted below
# the child dataset.
#
atf_test_case multi_dataset_2 cleanup
multi_dataset_2_body()
{
create_test_dirs
cd $TEST_INPUTS_DIR
mkdir dir1
echo a > dir1/a
mkdir dir2
echo b > dir2/b
cd -
atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
-o fs=${ZFS_POOL_NAME}/dir1\;mountpoint=/ \
-o fs=${ZFS_POOL_NAME}\;mountpoint=/dir1 \
$TEST_IMAGE $TEST_INPUTS_DIR
import_image
check_image_contents
}
multi_dataset_2_cleanup()
{
common_cleanup
}
#
# Create a dataset with a non-existent mount point.
#
atf_test_case multi_dataset_3 cleanup
multi_dataset_3_body()
{
create_test_dirs
cd $TEST_INPUTS_DIR
mkdir dir1
echo a > dir1/a
cd -
atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
-o fs=${ZFS_POOL_NAME}/dir1 \
-o fs=${ZFS_POOL_NAME}/dir2 \
$TEST_IMAGE $TEST_INPUTS_DIR
import_image
atf_check -o inline:${TEST_MOUNT_DIR}/dir2\\n -e empty -s exit:0 \
zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir2
# Mounting dir2 should have created a directory called dir2. Go
# back and create it in the staging tree before comparing.
atf_check mkdir ${TEST_INPUTS_DIR}/dir2
check_image_contents
}
multi_dataset_3_cleanup()
{
common_cleanup
}
#
# Create an unmounted dataset.
#
atf_test_case multi_dataset_4 cleanup
multi_dataset_4_body()
{
create_test_dirs
cd $TEST_INPUTS_DIR
mkdir dir1
echo a > dir1/a
cd -
atf_check $MAKEFS -s 1g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
-o fs=${ZFS_POOL_NAME}/dir1\;canmount=noauto\;mountpoint=none \
$TEST_IMAGE $TEST_INPUTS_DIR
import_image
atf_check -o inline:none\\n -e empty -s exit:0 \
zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1
check_image_contents
atf_check zfs set mountpoint=/dir1 ${ZFS_POOL_NAME}/dir1
atf_check zfs mount ${ZFS_POOL_NAME}/dir1
atf_check -o inline:${TEST_MOUNT_DIR}/dir1\\n -e empty -s exit:0 \
zfs list -H -o mountpoint ${ZFS_POOL_NAME}/dir1
# dir1/a should be part of the root dataset, not dir1.
atf_check -s not-exit:0 -e not-empty stat ${TEST_MOUNT_DIR}dir1/a
}
multi_dataset_4_cleanup()
{
common_cleanup
}
#
# Rudimentary test to verify that two ZFS images created using the same
# parameters and input hierarchy are byte-identical. In particular, makefs(1)
# does not preserve file access times.
#
atf_test_case reproducible cleanup
reproducible_body()
{
create_test_inputs
atf_check $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
${TEST_IMAGE}.1 $TEST_INPUTS_DIR
atf_check $MAKEFS -s 512m -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
${TEST_IMAGE}.2 $TEST_INPUTS_DIR
# XXX-MJ cmp(1) is really slow
atf_check cmp ${TEST_IMAGE}.1 ${TEST_IMAGE}.2
}
reproducible_cleanup()
{
}
#
# Verify that we can take a snapshot of a generated dataset.
#
atf_test_case snapshot cleanup
snapshot_body()
{
create_test_dirs
cd $TEST_INPUTS_DIR
mkdir dir
echo "hello" > dir/hello
echo "goodbye" > goodbye
cd -
atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
$TEST_IMAGE $TEST_INPUTS_DIR
import_image
atf_check zfs snapshot ${ZFS_POOL_NAME}@1
}
snapshot_cleanup()
{
common_cleanup
}
#
# Check handling of symbolic links.
#
atf_test_case soft_links cleanup
soft_links_body()
{
create_test_dirs
cd $TEST_INPUTS_DIR
mkdir dir
ln -s a a
ln -s dir/../a a
ln -s dir/b b
echo 'c' > dir
ln -s dir/c c
# XXX-MJ overflows bonus buffer ln -s $(jot -s '' 320 1 1) 1
cd -
atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
$TEST_IMAGE $TEST_INPUTS_DIR
import_image
check_image_contents
}
soft_links_cleanup()
{
common_cleanup
}
#
# Verify that we can set properties on the root dataset.
#
atf_test_case root_props cleanup
root_props_body()
{
create_test_inputs
atf_check $MAKEFS -s 10g -o rootpath=/ -o poolname=$ZFS_POOL_NAME \
-o fs=${ZFS_POOL_NAME}\;atime=off\;setuid=off \
$TEST_IMAGE $TEST_INPUTS_DIR
import_image
check_image_contents
atf_check -o inline:off\\n -e empty -s exit:0 \
zfs get -H -o value atime $ZFS_POOL_NAME
atf_check -o inline:local\\n -e empty -s exit:0 \
zfs get -H -o source atime $ZFS_POOL_NAME
atf_check -o inline:off\\n -e empty -s exit:0 \
zfs get -H -o value setuid $ZFS_POOL_NAME
atf_check -o inline:local\\n -e empty -s exit:0 \
zfs get -H -o source setuid $ZFS_POOL_NAME
}
root_props_cleanup()
{
common_cleanup
}
atf_init_test_cases()
{
atf_add_test_case autoexpand
atf_add_test_case basic
atf_add_test_case dataset_removal
atf_add_test_case empty_dir
atf_add_test_case empty_fs
atf_add_test_case file_sizes
atf_add_test_case hard_links
atf_add_test_case indirect_dnode_array
atf_add_test_case long_file_name
atf_add_test_case multi_dataset_1
atf_add_test_case multi_dataset_2
atf_add_test_case multi_dataset_3
atf_add_test_case multi_dataset_4
atf_add_test_case reproducible
atf_add_test_case snapshot
atf_add_test_case soft_links
atf_add_test_case root_props
# XXXMJ tests:
# - test with different ashifts (at least, 9 and 12), different image sizes
# - create datasets in imported pool
}

758
usr.sbin/makefs/zfs.c Normal file
View File

@ -0,0 +1,758 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2022 The FreeBSD Foundation
*
* This software was developed by Mark Johnston under sponsorship from
* the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/queue.h>
#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <util.h>
#include "makefs.h"
#include "zfs.h"
#define VDEV_LABEL_SPACE \
((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
_Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");
#define MINMSSIZE ((off_t)1 << 24) /* 16MB */
#define DFLTMSSIZE ((off_t)1 << 29) /* 512MB */
#define MAXMSSIZE ((off_t)1 << 34) /* 16GB */
#define INDIR_LEVELS 6
/* Indirect blocks are always 128KB. */
#define BLKPTR_PER_INDIR (MAXBLOCKSIZE / sizeof(blkptr_t))
struct dnode_cursor {
char inddir[INDIR_LEVELS][MAXBLOCKSIZE];
off_t indloc;
off_t indspace;
dnode_phys_t *dnode;
off_t dataoff;
off_t datablksz;
};
void
zfs_prep_opts(fsinfo_t *fsopts)
{
zfs_opt_t *zfs = ecalloc(1, sizeof(*zfs));
const option_t zfs_options[] = {
{ '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
0, 0, "Bootable dataset" },
{ '\0', "mssize", &zfs->mssize, OPT_INT64,
MINMSSIZE, MAXMSSIZE, "Metaslab size" },
{ '\0', "poolname", &zfs->poolname, OPT_STRPTR,
0, 0, "ZFS pool name" },
{ '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
0, 0, "Prefix for all dataset mount points" },
{ '\0', "ashift", &zfs->ashift, OPT_INT32,
MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
{ '\0', "nowarn", &zfs->nowarn, OPT_BOOL,
0, 0, "Suppress warning about experimental ZFS support" },
{ .name = NULL }
};
STAILQ_INIT(&zfs->datasetdescs);
fsopts->fs_specific = zfs;
fsopts->fs_options = copy_opts(zfs_options);
}
int
zfs_parse_opts(const char *option, fsinfo_t *fsopts)
{
zfs_opt_t *zfs;
struct dataset_desc *dsdesc;
char buf[BUFSIZ], *opt, *val;
int rv;
zfs = fsopts->fs_specific;
opt = val = estrdup(option);
opt = strsep(&val, "=");
if (strcmp(opt, "fs") == 0) {
if (val == NULL)
errx(1, "invalid filesystem parameters `%s'", option);
/*
* Dataset descriptions will be parsed later, in dsl_init().
* Just stash them away for now.
*/
dsdesc = ecalloc(1, sizeof(*dsdesc));
dsdesc->params = estrdup(val);
free(opt);
STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next);
return (1);
}
free(opt);
rv = set_option(fsopts->fs_options, option, buf, sizeof(buf));
return (rv == -1 ? 0 : 1);
}
static void
zfs_size_vdev(fsinfo_t *fsopts)
{
zfs_opt_t *zfs;
off_t asize, mssize, vdevsize, vdevsize1;
zfs = fsopts->fs_specific;
assert(fsopts->maxsize != 0);
assert(zfs->ashift != 0);
/*
* Figure out how big the vdev should be.
*/
vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift);
if (vdevsize < MINDEVSIZE)
errx(1, "maximum image size is too small");
if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) {
errx(1, "image size bounds must be multiples of %d",
1 << zfs->ashift);
}
asize = vdevsize - VDEV_LABEL_SPACE;
/*
* Size metaslabs according to the following heuristic:
* - provide at least 8 metaslabs,
* - without using a metaslab size larger than 512MB.
* This approximates what OpenZFS does without being complicated. In
* practice we expect pools to be expanded upon first use, and OpenZFS
* does not resize metaslabs in that case, so there is no right answer
* here. In general we want to provide large metaslabs even if the
* image size is small, and 512MB is a reasonable size for pools up to
* several hundred gigabytes.
*
* The user may override this heuristic using the "-o mssize" option.
*/
mssize = zfs->mssize;
if (mssize == 0) {
mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE);
if (!powerof2(mssize))
mssize = 1l << (flsll(mssize) - 1);
}
if (!powerof2(mssize))
errx(1, "metaslab size must be a power of 2");
/*
* If we have some slop left over, try to cover it by resizing the vdev,
* subject to the maxsize and minsize parameters.
*/
if (asize % mssize != 0) {
vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE;
if (vdevsize1 < fsopts->minsize)
vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE;
if (vdevsize1 <= fsopts->maxsize)
vdevsize = vdevsize1;
}
asize = vdevsize - VDEV_LABEL_SPACE;
zfs->asize = asize;
zfs->vdevsize = vdevsize;
zfs->mssize = mssize;
zfs->msshift = flsll(mssize) - 1;
zfs->mscount = asize / mssize;
}
/*
* Validate options and set some default values.
*/
static void
zfs_check_opts(fsinfo_t *fsopts)
{
zfs_opt_t *zfs;
zfs = fsopts->fs_specific;
if (fsopts->offset != 0)
errx(1, "unhandled offset option");
if (fsopts->maxsize == 0)
errx(1, "an image size must be specified");
if (zfs->poolname == NULL)
errx(1, "a pool name must be specified");
if (zfs->rootpath == NULL)
easprintf(&zfs->rootpath, "/%s", zfs->poolname);
if (zfs->rootpath[0] != '/')
errx(1, "mountpoint `%s' must be absolute", zfs->rootpath);
if (zfs->ashift == 0)
zfs->ashift = 12;
zfs_size_vdev(fsopts);
}
void
zfs_cleanup_opts(fsinfo_t *fsopts)
{
struct dataset_desc *d, *tmp;
zfs_opt_t *zfs;
zfs = fsopts->fs_specific;
free(zfs->rootpath);
free(zfs->bootfs);
free(__DECONST(void *, zfs->poolname));
STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) {
free(d->params);
free(d);
}
free(zfs);
free(fsopts->fs_options);
}
static size_t
nvlist_size(const nvlist_t *nvl)
{
return (sizeof(nvl->nv_header) + nvl->nv_size);
}
static void
nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz)
{
assert(sz >= nvlist_size(nvl));
memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header));
memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size);
}
static nvlist_t *
pool_config_nvcreate(zfs_opt_t *zfs)
{
nvlist_t *featuresnv, *poolnv;
poolnv = nvlist_create(NV_UNIQUE_NAME);
nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG);
nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION);
nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED);
nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname);
nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid);
nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid);
nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1);
featuresnv = nvlist_create(NV_UNIQUE_NAME);
nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv);
nvlist_destroy(featuresnv);
return (poolnv);
}
static nvlist_t *
pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs)
{
nvlist_t *diskvdevnv;
assert(zfs->objarrid != 0);
diskvdevnv = nvlist_create(NV_UNIQUE_NAME);
nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift);
nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize);
nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0);
nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null");
nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1);
nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY,
zfs->objarrid);
nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT,
zfs->msshift);
return (diskvdevnv);
}
static nvlist_t *
pool_root_vdev_config_nvcreate(zfs_opt_t *zfs)
{
nvlist_t *diskvdevnv, *rootvdevnv;
diskvdevnv = pool_disk_vdev_config_nvcreate(zfs);
rootvdevnv = nvlist_create(NV_UNIQUE_NAME);
nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0);
nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid);
nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv,
1);
nvlist_destroy(diskvdevnv);
return (rootvdevnv);
}
/*
* Create the pool's "config" object, which contains an nvlist describing pool
* parameters and the vdev topology. It is similar but not identical to the
* nvlist stored in vdev labels. The main difference is that vdev labels do not
* describe the full vdev tree and in particular do not contain the "root"
* meta-vdev.
*/
static void
pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
dnode_phys_t *dnode;
nvlist_t *poolconfig, *vdevconfig;
void *configbuf;
uint64_t dnid;
off_t configloc, configblksz;
int error;
dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST,
DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid);
poolconfig = pool_config_nvcreate(zfs);
vdevconfig = pool_root_vdev_config_nvcreate(zfs);
nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
nvlist_destroy(vdevconfig);
error = nvlist_export(poolconfig);
if (error != 0)
errc(1, error, "nvlist_export");
configblksz = nvlist_size(poolconfig);
configloc = objset_space_alloc(zfs, zfs->mos, &configblksz);
configbuf = ecalloc(1, configblksz);
nvlist_copy(poolconfig, configbuf, configblksz);
vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc);
dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT;
dnode->dn_flags = DNODE_FLAG_USED_BYTES;
*(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig);
zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid);
nvlist_destroy(poolconfig);
free(configbuf);
}
/*
* Add objects block pointer list objects, used for deferred frees. We don't do
* anything with them, but they need to be present or OpenZFS will refuse to
* import the pool.
*/
static void
pool_init_objdir_bplists(zfs_opt_t *zfs __unused, zfs_zap_t *objdir)
{
uint64_t dnid;
(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
BPOBJ_SIZE_V2, &dnid);
zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid);
(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
BPOBJ_SIZE_V2, &dnid);
zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid);
}
/*
* Add required feature metadata objects. We don't know anything about ZFS
* features, so the objects are just empty ZAPs.
*/
static void
pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
dnode_phys_t *dnode;
uint64_t dnid;
dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid);
zap_write(zfs, zap_alloc(zfs->mos, dnode));
dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid);
zap_write(zfs, zap_alloc(zfs->mos, dnode));
dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid);
zap_write(zfs, zap_alloc(zfs->mos, dnode));
}
static void
pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET,
dsl_dir_id(zfs->rootdsldir));
}
static void
pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
dnode_phys_t *dnode;
uint64_t id;
dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id);
zap_add_uint64(objdir, DMU_POOL_PROPS, id);
zfs->poolprops = zap_alloc(zfs->mos, dnode);
}
/*
 * Initialize the MOS object directory, the root of virtually all of the pool's
 * data and metadata.
 */
static void
pool_init_objdir(zfs_opt_t *zfs)
{
	zfs_zap_t *zap;
	dnode_phys_t *objdir;

	/* The object directory dnode was preallocated in pool_init(). */
	objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT);

	zap = zap_alloc(zfs->mos, objdir);
	pool_init_objdir_config(zfs, zap);
	pool_init_objdir_bplists(zfs, zap);
	pool_init_objdir_feature_maps(zfs, zap);
	pool_init_objdir_dsl(zfs, zap);
	pool_init_objdir_poolprops(zfs, zap);
	zap_write(zfs, zap);
}
/*
* Initialize the meta-object set (MOS) and immediately write out several
* special objects whose contents are already finalized, including the object
* directory.
*
* Once the MOS is finalized, it'll look roughly like this:
*
* object directory (ZAP)
* |-> vdev config object (nvlist)
* |-> features for read
* |-> features for write
* |-> feature descriptions
* |-> sync bplist
* |-> free bplist
* |-> pool properties
* L-> root DSL directory
* |-> DSL child directory (ZAP)
* | |-> $MOS (DSL dir)
* | | |-> child map
* | | L-> props (ZAP)
* | |-> $FREE (DSL dir)
* | | |-> child map
* | | L-> props (ZAP)
* | |-> $ORIGIN (DSL dir)
* | | |-> child map
* | | |-> dataset
* | | | L-> deadlist
* | | |-> snapshot
* | | | |-> deadlist
* | | | L-> snapshot names
* | | |-> props (ZAP)
* | | L-> clones (ZAP)
* | |-> dataset 1 (DSL dir)
* | | |-> DSL dataset
* | | | |-> snapshot names
* | | | L-> deadlist
* | | |-> child map
* | | | L-> ...
* | | L-> props
* | |-> dataset 2
* | | L-> ...
* | |-> ...
* | L-> dataset n
* |-> DSL root dataset
* | |-> snapshot names
* | L-> deadlist
* L-> props (ZAP)
* space map object array
* |-> space map 1
* |-> space map 2
* |-> ...
* L-> space map n (zfs->mscount)
*
* The space map object array is pointed to by the "msarray" property in the
* pool configuration.
*/
/*
 * Create the meta-object set and the handful of objects whose dnode IDs must
 * be known up front, then build the DSL layer and the object directory.
 */
static void
pool_init(zfs_opt_t *zfs)
{
	uint64_t dnid;

	/*
	 * GUIDs are pseudo-random but reproducible: zfs_makefs() seeds the
	 * PRNG with a fixed value.  (Do not reorder these calls; the sequence
	 * of random() results determines the on-disk GUIDs.)
	 */
	zfs->poolguid = ((uint64_t)random() << 32) | random();
	zfs->vdevguid = ((uint64_t)random() << 32) | random();

	zfs->mos = objset_alloc(zfs, DMU_OST_META);

	/* The object directory must land on its well-known dnode ID. */
	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid);
	assert(dnid == DMU_POOL_DIRECTORY_OBJECT);

	/* The space-map object array ("msarray" in the config comment above). */
	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid);

	dsl_init(zfs);

	pool_init_objdir(zfs);
}
/*
 * Assemble the pool configuration nvlist, replicate the uberblock throughout
 * the uberblock ring, and write the resulting label to all four standard
 * label locations on the vdev.
 */
static void
pool_labels_write(zfs_opt_t *zfs)
{
	uberblock_t *ub;
	vdev_label_t *label;
	nvlist_t *poolconfig, *vdevconfig;
	int error;

	label = ecalloc(1, sizeof(*label));

	/*
	 * Assemble the vdev configuration and store it in the label.
	 */
	poolconfig = pool_config_nvcreate(zfs);
	vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
	nvlist_destroy(vdevconfig);

	error = nvlist_export(poolconfig);
	if (error != 0)
		errc(1, error, "nvlist_export");
	nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
	    sizeof(label->vl_vdev_phys.vp_nvlist));
	nvlist_destroy(poolconfig);

	/*
	 * Fill out the uberblock.  Just make each one the same.  The embedded
	 * checksum is calculated in vdev_label_write().
	 */
	for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
	    uoff += (1 << zfs->ashift)) {
		ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
		ub->ub_magic = UBERBLOCK_MAGIC;
		ub->ub_version = SPA_VERSION;
		ub->ub_txg = TXG;
		ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid;
		/* A zero timestamp keeps images reproducible. */
		ub->ub_timestamp = 0;

		ub->ub_software_version = SPA_VERSION;
		ub->ub_mmp_magic = MMP_MAGIC;
		ub->ub_mmp_delay = 0;
		ub->ub_mmp_config = 0;
		ub->ub_checkpoint_txg = 0;
		/* Each uberblock points at the root of the MOS. */
		objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp);
	}

	/*
	 * Write out four copies of the label: two at the beginning of the vdev
	 * and two at the end.
	 */
	for (int i = 0; i < VDEV_LABELS; i++)
		vdev_label_write(zfs, i, label);

	free(label);
}
/*
 * Write out the remaining pool metadata.  The order matters: the labels copy
 * the MOS root block pointer, so the MOS must be written first, and the MOS in
 * turn references the pool properties and DSL objects.
 */
static void
pool_fini(zfs_opt_t *zfs)
{
	zap_write(zfs, zfs->poolprops);
	dsl_write(zfs);
	objset_write(zfs, zfs->mos);
	pool_labels_write(zfs);
}
/*
 * Set up a cursor for writing out a dnode's data blocks in order of increasing
 * offset.  Picks a default block size if none is given, computes the number of
 * indirect-block levels, and preallocates contiguous space for all indirect
 * blocks.
 */
struct dnode_cursor *
dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode,
    off_t size, off_t blksz)
{
	struct dnode_cursor *c;
	uint64_t nbppindir, indlevel, ndatablks, nindblks;

	assert(dnode->dn_nblkptr == 1);
	assert(blksz <= MAXBLOCKSIZE);

	if (blksz == 0) {
		/* Must be between 1<<ashift and 128KB. */
		blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift,
		    powerof2(size) ? size : (1ul << flsll(size))));
	}
	assert(powerof2(blksz));

	/*
	 * Do we need indirect blocks?  Figure out how many levels are needed
	 * (indlevel == 1 means no indirect blocks) and how much space is needed
	 * (it has to be allocated up-front to break the dependency cycle
	 * described in objset_write()).
	 *
	 * Level k holds howmany(ndatablks, BLKPTR_PER_INDIR^k) indirect
	 * blocks.  The previous divisor, "indlevel * nbppindir", undercounted
	 * levels >= 2 for files needing three or more indirect levels, which
	 * would exhaust the preallocation and trip the indspace assertions in
	 * _dnode_cursor_flush().
	 */
	ndatablks = size == 0 ? 0 : howmany(size, blksz);
	nindblks = 0;
	for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) {
		nbppindir *= BLKPTR_PER_INDIR;
		nindblks += howmany(ndatablks, nbppindir);
	}
	assert(indlevel < INDIR_LEVELS);

	dnode->dn_nlevels = (uint8_t)indlevel;
	dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0;
	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;

	c = ecalloc(1, sizeof(*c));
	if (nindblks > 0) {
		/* Indirect blocks are always written at full size. */
		c->indspace = nindblks * MAXBLOCKSIZE;
		c->indloc = objset_space_alloc(zfs, os, &c->indspace);
	}
	c->dnode = dnode;
	c->dataoff = 0;
	c->datablksz = blksz;

	return (c);
}
/*
 * Write out the lowest "levels" levels of the cursor's indirect-block buffers.
 * Each buffer goes to its preallocated location, and its block pointer is
 * recorded in the next level's buffer -- or in the dnode itself for the
 * top-most level.
 */
static void
_dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, int levels)
{
	blkptr_t *bp, *pbp;
	void *buf;
	uint64_t fill;
	off_t blkid, blksz, loc;

	assert(levels > 0);
	assert(levels <= c->dnode->dn_nlevels - 1);

	blksz = MAXBLOCKSIZE;
	blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR;
	for (int level = 1; level <= levels; level++) {
		buf = c->inddir[level - 1];

		if (level == c->dnode->dn_nlevels - 1) {
			/* The top indirect block hangs off the dnode. */
			pbp = &c->dnode->dn_blkptr[0];
		} else {
			uint64_t iblkid;

			/* Slot for this block in its parent's buffer. */
			iblkid = blkid & (BLKPTR_PER_INDIR - 1);
			pbp = (blkptr_t *)
			    &c->inddir[level][iblkid * sizeof(blkptr_t)];
		}

		/*
		 * Space for indirect blocks is allocated up-front; see the
		 * comment in objset_write().
		 */
		loc = c->indloc;
		c->indloc += blksz;
		assert(c->indspace >= blksz);
		c->indspace -= blksz;

		/* Sum the children's fill counts into this block's fill. */
		bp = buf;
		fill = 0;
		for (size_t i = 0; i < BLKPTR_PER_INDIR; i++)
			fill += BP_GET_FILL(&bp[i]);

		vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz,
		    loc, pbp);
		/* Reset the buffer for the next group of block pointers. */
		memset(buf, 0, MAXBLOCKSIZE);

		blkid /= BLKPTR_PER_INDIR;
	}
}
/*
 * Return the block-pointer slot for the data block at byte offset "off",
 * first flushing any indirect blocks completed by advancing past them.
 * Offsets must be visited in increasing order.
 */
blkptr_t *
dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off)
{
	off_t blkid, l1id;
	int levels;

	if (c->dnode->dn_nlevels == 1) {
		/* No indirection: the dnode's single blkptr is used directly. */
		assert(off < MAXBLOCKSIZE);
		return (&c->dnode->dn_blkptr[0]);
	}

	assert(off % c->datablksz == 0);

	/* Do we need to flush any full indirect blocks? */
	if (off > 0) {
		blkid = off / c->datablksz;
		/* A level fills up each time blkid wraps at that level. */
		for (levels = 0; levels < c->dnode->dn_nlevels - 1; levels++) {
			if (blkid % BLKPTR_PER_INDIR != 0)
				break;
			blkid /= BLKPTR_PER_INDIR;
		}
		if (levels > 0)
			_dnode_cursor_flush(zfs, c, levels);
	}

	c->dataoff = off;
	l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1);
	return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]);
}
/*
 * Flush any partially filled indirect blocks and release the cursor.  All of
 * the preallocated indirect-block space must have been consumed by now.
 */
void
dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c)
{
	int indlevels = c->dnode->dn_nlevels - 1;

	if (indlevels > 0)
		_dnode_cursor_flush(zfs, c, indlevels);
	assert(c->indspace == 0);
	free(c);
}
/*
 * makefs(8) entry point for ZFS: create a pool backed by a single-disk vdev
 * in "image" and populate its dataset(s) from the staged tree at "dir".
 */
void
zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	int dirfd;

	zfs = fsopts->fs_specific;

	/*
	 * Use a fixed seed to provide reproducible pseudo-random numbers for
	 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts).
	 */
	srandom(1729);

	zfs_check_opts(fsopts);

	if (!zfs->nowarn) {
		fprintf(stderr,
		    "ZFS support is currently considered experimental. "
		    "Do not use it for anything critical.\n");
	}

	dirfd = open(dir, O_DIRECTORY | O_RDONLY);
	if (dirfd < 0)
		err(1, "open(%s)", dir);

	vdev_init(zfs, image);
	pool_init(zfs);
	/* NOTE(review): fs_build() presumably takes ownership of dirfd and
	 * closes it; it is not closed here -- verify. */
	fs_build(zfs, dirfd, root);
	pool_fini(zfs);
	vdev_fini(zfs);
}

View File

@ -0,0 +1,12 @@
# Sources for makefs's ZFS backend.  nvlist.c is picked up from the boot
# loader's libsa (via the second .PATH) and needs its include path plus a
# cast-qual warning exemption.
.PATH:	${SRCDIR}/zfs
.PATH:	${SRCTOP}/stand/libsa/zfs

SRCS+=	dsl.c \
	fs.c \
	objset.c \
	vdev.c \
	zap.c

SRCS+=	nvlist.c
CFLAGS.nvlist.c+= -I${SRCTOP}/stand/libsa -Wno-cast-qual

598
usr.sbin/makefs/zfs/dsl.c Normal file
View File

@ -0,0 +1,598 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2022 The FreeBSD Foundation
*
* This software was developed by Mark Johnston under sponsorship from
* the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <assert.h>
#include <string.h>
#include <util.h>
#include "makefs.h"
#include "zfs.h"
/*
 * In-memory representation of a DSL dataset: ties a dataset dnode in the MOS
 * to the object set (if any) holding the dataset's contents.
 */
typedef struct zfs_dsl_dataset {
	zfs_objset_t	*os;		/* referenced objset, may be null */
	dsl_dataset_phys_t *phys;	/* on-disk representation */
	uint64_t	dsid;		/* DSL dataset dnode */

	struct zfs_dsl_dir *dir;	/* containing parent */
} zfs_dsl_dataset_t;

typedef STAILQ_HEAD(zfs_dsl_dir_list, zfs_dsl_dir) zfs_dsl_dir_list_t;

/*
 * In-memory representation of a DSL directory: a node in the pool's dataset
 * namespace, with its two ZAPs (properties and children) and optional head
 * dataset.
 */
typedef struct zfs_dsl_dir {
	char		*fullname;	/* full dataset name */
	char		*name;		/* basename(fullname) */
	dsl_dir_phys_t	*phys;		/* on-disk representation */
	nvlist_t	*propsnv;	/* properties saved in propszap */

	zfs_dsl_dataset_t *headds;	/* principal dataset, may be null */

	uint64_t	dirid;		/* DSL directory dnode */
	zfs_zap_t	*propszap;	/* dataset properties */
	zfs_zap_t	*childzap;	/* child directories */

	/* DSL directory tree linkage. */
	struct zfs_dsl_dir *parent;
	zfs_dsl_dir_list_t children;
	STAILQ_ENTRY(zfs_dsl_dir) next;
} zfs_dsl_dir_t;
static zfs_dsl_dir_t *dsl_dir_alloc(zfs_opt_t *zfs, const char *name);
static zfs_dsl_dataset_t *dsl_dataset_alloc(zfs_opt_t *zfs, zfs_dsl_dir_t *dir);
/*
 * Look up a string-valued pair, storing a NUL-terminated copy that the caller
 * must free.  Returns non-zero if the key is not present.
 */
static int
nvlist_find_string(nvlist_t *nvl, const char *key, char **retp)
{
	char *val;
	int len, rc;

	rc = nvlist_find(nvl, key, DATA_TYPE_STRING, NULL, &val, &len);
	if (rc != 0)
		return (rc);
	*retp = ecalloc(1, len + 1);
	memcpy(*retp, val, len);
	return (0);
}
/*
 * Look up a uint64-valued pair; returns non-zero if the key is not present.
 */
static int
nvlist_find_uint64(nvlist_t *nvl, const char *key, uint64_t *retp)
{
	return (nvlist_find(nvl, key, DATA_TYPE_UINT64, NULL, retp, NULL));
}
/*
 * Return an allocated string containing the head dataset's mountpoint,
 * including the root path prefix, or NULL if the mountpoint is "none".  The
 * caller frees the result.
 *
 * If the dataset has a mountpoint property, it is returned.  Otherwise we have
 * to follow ZFS' inheritance rules.
 */
char *
dsl_dir_get_mountpoint(zfs_opt_t *zfs, zfs_dsl_dir_t *dir)
{
	zfs_dsl_dir_t *pdir;
	char *mountpoint, *origmountpoint;

	if (nvlist_find_string(dir->propsnv, "mountpoint", &mountpoint) == 0) {
		/*
		 * nvlist_find_string() hands back an allocated copy, so we
		 * already own "mountpoint".  (The old code both leaked that
		 * copy via an extra estrdup() and leaked it again when
		 * returning NULL for "none".)
		 */
		if (strcmp(mountpoint, "none") == 0) {
			free(mountpoint);
			return (NULL);
		}
	} else {
		/*
		 * If we don't have a mountpoint, it's inherited from one of our
		 * ancestors.  Walk up the hierarchy until we find it, building
		 * up our mountpoint along the way.  The mountpoint property is
		 * always set for the root dataset.
		 */
		for (pdir = dir->parent, mountpoint = estrdup(dir->name);;) {
			char *pmountpoint;

			origmountpoint = mountpoint;

			if (nvlist_find_string(pdir->propsnv, "mountpoint",
			    &pmountpoint) == 0) {
				easprintf(&mountpoint, "%s%s%s", pmountpoint,
				    pmountpoint[strlen(pmountpoint) - 1] == '/' ?
				    "" : "/", origmountpoint);
				free(pmountpoint);
				free(origmountpoint);
				break;
			}

			/* Prepend the ancestor's name and keep climbing. */
			easprintf(&mountpoint, "%s/%s", pdir->name,
			    origmountpoint);
			free(origmountpoint);
			pdir = pdir->parent;
		}
	}
	assert(mountpoint[0] == '/');
	assert(strstr(mountpoint, zfs->rootpath) == mountpoint);

	return (mountpoint);
}
/*
 * Fetch the "canmount" property value; returns non-zero if it was never set.
 */
int
dsl_dir_get_canmount(zfs_dsl_dir_t *dir, uint64_t *canmountp)
{
	return (nvlist_find_uint64(dir->propsnv, "canmount", canmountp));
}
/*
 * Handle dataset properties that we know about; stash them into an nvlist to be
 * written later to the properties ZAP object.
 *
 * If the set of properties we handle grows too much, we should probably explore
 * using libzfs to manage them.
 */
static void
dsl_dir_set_prop(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, const char *key,
    const char *val)
{
	nvlist_t *nvl;

	nvl = dir->propsnv;
	if (val == NULL || val[0] == '\0')
		errx(1, "missing value for property `%s'", key);
	if (nvpair_find(nvl, key) != NULL)
		errx(1, "property `%s' already set", key);

	if (strcmp(key, "mountpoint") == 0) {
		if (strcmp(val, "none") != 0) {
			/*
			 * A mountpoint other than "none" must be an absolute
			 * path and must live under the image's root path.
			 */
			if (val[0] != '/')
				errx(1, "mountpoint `%s' is not absolute", val);
			if (strcmp(val, zfs->rootpath) != 0 &&
			    strcmp(zfs->rootpath, "/") != 0 &&
			    (strstr(val, zfs->rootpath) != val ||
			    val[strlen(zfs->rootpath)] != '/')) {
				errx(1, "mountpoint `%s' is not prefixed by "
				    "the root path `%s'", val, zfs->rootpath);
			}
		}
		nvlist_add_string(nvl, key, val);
	} else if (strcmp(key, "atime") == 0 || strcmp(key, "exec") == 0 ||
	    strcmp(key, "setuid") == 0) {
		/* Boolean properties are stored as 0 or 1. */
		if (strcmp(val, "on") == 0)
			nvlist_add_uint64(nvl, key, 1);
		else if (strcmp(val, "off") == 0)
			nvlist_add_uint64(nvl, key, 0);
		else
			errx(1, "invalid value `%s' for %s", val, key);
	} else if (strcmp(key, "canmount") == 0) {
		/* 0/1/2 for off/on/noauto -- presumably matching OpenZFS'
		 * encoding; verify against ZFS_CANMOUNT_* before extending. */
		if (strcmp(val, "noauto") == 0)
			nvlist_add_uint64(nvl, key, 2);
		else if (strcmp(val, "on") == 0)
			nvlist_add_uint64(nvl, key, 1);
		else if (strcmp(val, "off") == 0)
			nvlist_add_uint64(nvl, key, 0);
		else
			errx(1, "invalid value `%s' for %s", val, key);
	} else {
		errx(1, "unknown property `%s'", key);
	}
}
/*
 * Create one of the special DSL directories (e.g., $MOS) that live directly
 * below the root of the pool's namespace.
 */
static zfs_dsl_dir_t *
dsl_metadir_alloc(zfs_opt_t *zfs, const char *name)
{
	zfs_dsl_dir_t *metadir;
	char *fullname;

	easprintf(&fullname, "%s/%s", zfs->poolname, name);
	metadir = dsl_dir_alloc(zfs, fullname);
	free(fullname);

	return (metadir);
}
/*
 * Create the $ORIGIN DSL directory with its origin dataset and snapshot, plus
 * the clones ZAP into which every head dataset is later linked (see
 * dsl_dir_finalize()).
 */
static void
dsl_origindir_init(zfs_opt_t *zfs)
{
	dnode_phys_t *clones;
	uint64_t clonesid;

	zfs->origindsldir = dsl_metadir_alloc(zfs, "$ORIGIN");
	zfs->originds = dsl_dataset_alloc(zfs, zfs->origindsldir);
	zfs->snapds = dsl_dataset_alloc(zfs, zfs->origindsldir);

	clones = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_CLONES, &clonesid);
	zfs->cloneszap = zap_alloc(zfs->mos, clones);
	zfs->origindsldir->phys->dd_clones = clonesid;
}
/*
 * Set up the DSL layer: the root DSL directory and its head dataset, the
 * $MOS/$FREE/$ORIGIN metadata directories, and one DSL directory per dataset
 * specified on the command line, applying any per-dataset properties.
 */
void
dsl_init(zfs_opt_t *zfs)
{
	zfs_dsl_dir_t *dir;
	struct dataset_desc *d;
	const char *dspropdelim;

	dspropdelim = ";";

	zfs->rootdsldir = dsl_dir_alloc(zfs, NULL);

	/* Compression is never used; record that as a root-level property. */
	nvlist_add_uint64(zfs->rootdsldir->propsnv, "compression",
	    ZIO_COMPRESS_OFF);

	zfs->rootds = dsl_dataset_alloc(zfs, zfs->rootdsldir);
	zfs->rootdsldir->headds = zfs->rootds;

	zfs->mosdsldir = dsl_metadir_alloc(zfs, "$MOS");
	zfs->freedsldir = dsl_metadir_alloc(zfs, "$FREE");
	dsl_origindir_init(zfs);

	/*
	 * Go through the list of user-specified datasets and create DSL objects
	 * for them.
	 */
	STAILQ_FOREACH(d, &zfs->datasetdescs, next) {
		char *dsname, *next, *params, *param, *nextparam;

		/* Split "<dataset>[;prop=val[;...]]" at the first delimiter. */
		params = d->params;
		dsname = strsep(&params, dspropdelim);

		if (strcmp(dsname, zfs->poolname) == 0) {
			/*
			 * This is the root dataset; it's already created, so
			 * we're just setting options.
			 */
			dir = zfs->rootdsldir;
		} else {
			/*
			 * This dataset must be a child of the root dataset.
			 */
			if (strstr(dsname, zfs->poolname) != dsname ||
			    (next = strchr(dsname, '/')) == NULL ||
			    (size_t)(next - dsname) != strlen(zfs->poolname)) {
				errx(1, "dataset `%s' must be a child of `%s'",
				    dsname, zfs->poolname);
			}
			dir = dsl_dir_alloc(zfs, dsname);
			dir->headds = dsl_dataset_alloc(zfs, dir);
		}

		/* Apply each remaining ";"-separated key=value pair. */
		for (nextparam = param = params; nextparam != NULL;) {
			char *key, *val;

			param = strsep(&nextparam, dspropdelim);

			key = val = param;
			key = strsep(&val, "=");
			dsl_dir_set_prop(zfs, dir, key, val);
		}
	}

	/*
	 * Set the root dataset's mount point if the user didn't override the
	 * default.
	 */
	if (nvpair_find(zfs->rootdsldir->propsnv, "mountpoint") == NULL) {
		nvlist_add_string(zfs->rootdsldir->propsnv, "mountpoint",
		    zfs->rootpath);
	}
}
/*
 * Return the dnode ID of the DSL directory's MOS object.
 */
uint64_t
dsl_dir_id(zfs_dsl_dir_t *dir)
{
	return (dir->dirid);
}
/*
 * Return the dnode ID of the directory's head dataset; the directory must
 * have one (see dsl_dir_has_dataset()).
 */
uint64_t
dsl_dir_dataset_id(zfs_dsl_dir_t *dir)
{
	return (dir->headds->dsid);
}
/*
 * Invoke "cb" on every DSL directory in the subtree rooted at "dsldir", in
 * post-order: children are visited before their parent.
 */
static void
dsl_dir_foreach_post(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir,
    void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg)
{
	zfs_dsl_dir_t *child;

	STAILQ_FOREACH(child, &dsldir->children, next)
		dsl_dir_foreach_post(zfs, child, cb, arg);
	cb(zfs, dsldir, arg);
}
/*
 * Used when the caller doesn't care about the order one way or another.
 * Currently implemented as a post-order walk.
 */
void
dsl_dir_foreach(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir,
    void (*cb)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *arg)
{
	dsl_dir_foreach_post(zfs, dsldir, cb, arg);
}
/*
 * Return the directory's full dataset name, e.g., "pool/usr/home".
 */
const char *
dsl_dir_fullname(const zfs_dsl_dir_t *dir)
{
	return (dir->fullname);
}
/*
 * Create a DSL directory, which is effectively an entry in the ZFS namespace.
 * We always create a root DSL directory, whose name is the pool's name, and
 * several metadata directories.
 *
 * Each directory has two ZAP objects, one pointing to child directories, and
 * one for properties (which are inherited by children unless overridden).
 * Directories typically reference a DSL dataset, the "head dataset", which
 * points to an object set.
 */
static zfs_dsl_dir_t *
dsl_dir_alloc(zfs_opt_t *zfs, const char *name)
{
	zfs_dsl_dir_list_t l, *lp;
	zfs_dsl_dir_t *dir, *parent;
	dnode_phys_t *dnode;
	char *dirname, *nextdir, *origname;
	uint64_t childid, propsid;

	dir = ecalloc(1, sizeof(*dir));

	/* The directory's on-disk state lives in its dnode's bonus buffer. */
	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DSL_DIR,
	    DMU_OT_DSL_DIR, sizeof(dsl_dir_phys_t), &dir->dirid);
	dir->phys = (dsl_dir_phys_t *)DN_BONUS(dnode);

	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_PROPS, &propsid);
	dir->propszap = zap_alloc(zfs->mos, dnode);

	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DIR_CHILD_MAP,
	    &childid);
	dir->childzap = zap_alloc(zfs->mos, dnode);

	dir->propsnv = nvlist_create(NV_UNIQUE_NAME);
	STAILQ_INIT(&dir->children);

	dir->phys->dd_child_dir_zapobj = childid;
	dir->phys->dd_props_zapobj = propsid;

	if (name == NULL) {
		/*
		 * This is the root DSL directory.
		 */
		dir->name = estrdup(zfs->poolname);
		dir->fullname = estrdup(zfs->poolname);
		dir->parent = NULL;
		dir->phys->dd_parent_obj = 0;

		assert(zfs->rootdsldir == NULL);
		zfs->rootdsldir = dir;
		return (dir);
	}

	/*
	 * Insert the new directory into the hierarchy.  Currently this must be
	 * done in order, e.g., when creating pool/a/b, pool/a must already
	 * exist.
	 *
	 * NOTE(review): "name" is assumed to contain at least one '/'
	 * ("<pool>/<child>"); a name without one would break out of the loop
	 * below with "parent" never assigned.  All current callers satisfy
	 * this -- confirm before adding new ones.
	 */
	STAILQ_INIT(&l);
	STAILQ_INSERT_HEAD(&l, zfs->rootdsldir, next);
	origname = dirname = nextdir = estrdup(name);
	for (lp = &l;; lp = &parent->children) {
		dirname = strsep(&nextdir, "/");
		if (nextdir == NULL)
			break;

		/* Find the component's directory in the current sibling list. */
		STAILQ_FOREACH(parent, lp, next) {
			if (strcmp(parent->name, dirname) == 0)
				break;
		}
		if (parent == NULL) {
			errx(1, "no parent at `%s' for filesystem `%s'",
			    dirname, name);
		}
	}

	dir->fullname = estrdup(name);
	dir->name = estrdup(dirname);
	free(origname);
	STAILQ_INSERT_TAIL(lp, dir, next);
	zap_add_uint64(parent->childzap, dir->name, dir->dirid);

	dir->parent = parent;
	dir->phys->dd_parent_obj = parent->dirid;
	return (dir);
}
/*
 * Record the number of bytes used by the directory's dataset(s).  Compression
 * is never used by makefs, so the three counters are identical.
 */
void
dsl_dir_size_set(zfs_dsl_dir_t *dir, uint64_t bytes)
{
	dir->phys->dd_used_bytes = bytes;
	dir->phys->dd_compressed_bytes = bytes;
	dir->phys->dd_uncompressed_bytes = bytes;
}
/*
 * Convert dataset properties into entries in the DSL directory's properties
 * ZAP.  Only uint64 and string values occur, since dsl_dir_set_prop() and
 * dsl_init() add nothing else.
 */
static void
dsl_dir_finalize_props(zfs_dsl_dir_t *dir)
{
	/* Walk the nvlist's raw pair encoding. */
	for (nvp_header_t *nvh = NULL;
	    (nvh = nvlist_next_nvpair(dir->propsnv, nvh)) != NULL;) {
		nv_string_t *nvname;
		nv_pair_data_t *nvdata;
		const char *name;

		/* The pair's name string precedes its data, 4-byte aligned. */
		nvname = (nv_string_t *)(nvh + 1);
		nvdata = (nv_pair_data_t *)(&nvname->nv_data[0] +
		    NV_ALIGN4(nvname->nv_size));

		name = nvstring_get(nvname);
		switch (nvdata->nv_type) {
		case DATA_TYPE_UINT64: {
			uint64_t val;

			memcpy(&val, &nvdata->nv_data[0], sizeof(uint64_t));
			zap_add_uint64(dir->propszap, name, val);
			break;
		}
		case DATA_TYPE_STRING: {
			nv_string_t *nvstr;

			nvstr = (nv_string_t *)&nvdata->nv_data[0];
			zap_add_string(dir->propszap, name,
			    nvstring_get(nvstr));
			break;
		}
		default:
			assert(0);
		}
	}
}
/*
 * Finalize one DSL directory: write its props and child ZAPs, and if it has a
 * head dataset with an attached object set, link the dataset to the $ORIGIN
 * snapshot and roll its space usage up into the directory, children included.
 * Invoked in post-order from dsl_write().
 */
static void
dsl_dir_finalize(zfs_opt_t *zfs, zfs_dsl_dir_t *dir, void *arg __unused)
{
	char key[32];
	zfs_dsl_dir_t *cdir;
	dnode_phys_t *snapnames;
	zfs_dsl_dataset_t *headds;
	zfs_objset_t *os;
	uint64_t bytes, snapnamesid;

	dsl_dir_finalize_props(dir);
	zap_write(zfs, dir->propszap);
	zap_write(zfs, dir->childzap);

	headds = dir->headds;
	if (headds == NULL)
		return;
	os = headds->os;
	if (os == NULL)
		return;

	/* Every dataset gets a snapshot-names ZAP; ours stays empty. */
	snapnames = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DS_SNAP_MAP,
	    &snapnamesid);
	zap_write(zfs, zap_alloc(zfs->mos, snapnames));

	dir->phys->dd_head_dataset_obj = headds->dsid;
	dir->phys->dd_clone_parent_obj = zfs->snapds->dsid;
	headds->phys->ds_prev_snap_obj = zfs->snapds->dsid;
	headds->phys->ds_snapnames_zapobj = snapnamesid;
	objset_root_blkptr_copy(os, &headds->phys->ds_bp);

	/* Each head dataset is recorded as a clone of the $ORIGIN snapshot. */
	zfs->snapds->phys->ds_num_children++;
	snprintf(key, sizeof(key), "%jx", (uintmax_t)headds->dsid);
	zap_add_uint64(zfs->cloneszap, key, headds->dsid);

	bytes = objset_space(os);
	headds->phys->ds_used_bytes = bytes;
	headds->phys->ds_uncompressed_bytes = bytes;
	headds->phys->ds_compressed_bytes = bytes;

	/* Post-order traversal: child totals are already final. */
	STAILQ_FOREACH(cdir, &dir->children, next)
		bytes += cdir->phys->dd_used_bytes;
	dsl_dir_size_set(dir, bytes);
}
/*
 * Finalize every DSL directory and write the remaining DSL metadata, wiring
 * the $ORIGIN directory's dataset/snapshot pair together last.
 */
void
dsl_write(zfs_opt_t *zfs)
{
	zfs_zap_t *snapnameszap;
	dnode_phys_t *snapnames;
	uint64_t snapmapid;

	/*
	 * Perform accounting, starting from the leaves of the DSL directory
	 * tree.  Accounting for $MOS is done later, once we've finished
	 * allocating space.
	 */
	dsl_dir_foreach_post(zfs, zfs->rootdsldir, dsl_dir_finalize, NULL);

	/* The $ORIGIN dataset's snapshot map names its lone snapshot. */
	snapnames = objset_dnode_alloc(zfs->mos, DMU_OT_DSL_DS_SNAP_MAP,
	    &snapmapid);
	snapnameszap = zap_alloc(zfs->mos, snapnames);
	zap_add_uint64(snapnameszap, "$ORIGIN", zfs->snapds->dsid);
	zap_write(zfs, snapnameszap);

	zfs->origindsldir->phys->dd_head_dataset_obj = zfs->originds->dsid;
	zfs->originds->phys->ds_prev_snap_obj = zfs->snapds->dsid;
	zfs->originds->phys->ds_snapnames_zapobj = snapmapid;

	zfs->snapds->phys->ds_next_snap_obj = zfs->originds->dsid;
	/* dsl_dir_finalize() bumped this once per head dataset. */
	assert(zfs->snapds->phys->ds_num_children > 0);
	zfs->snapds->phys->ds_num_children++;

	zap_write(zfs, zfs->cloneszap);

	/* XXX-MJ dirs and datasets are leaked */
}
/*
 * Attach the object set to the directory's head dataset and write it out.
 */
void
dsl_dir_dataset_write(zfs_opt_t *zfs, zfs_objset_t *os, zfs_dsl_dir_t *dir)
{
	dir->headds->os = os;
	objset_write(zfs, os);
}
/*
 * Does this DSL directory have a head dataset?
 */
bool
dsl_dir_has_dataset(zfs_dsl_dir_t *dir)
{
	return (dir->headds != NULL);
}
/*
 * Has an object set been attached to this directory's head dataset (see
 * dsl_dir_dataset_write())?
 */
bool
dsl_dir_dataset_has_objset(zfs_dsl_dir_t *dir)
{
	return (dsl_dir_has_dataset(dir) && dir->headds->os != NULL);
}
/*
 * Allocate a DSL dataset under "dir", together with a deadlist object that is
 * written immediately as an empty ZAP.  The dataset GUID comes from the
 * seeded PRNG, so it is reproducible across runs.
 */
static zfs_dsl_dataset_t *
dsl_dataset_alloc(zfs_opt_t *zfs, zfs_dsl_dir_t *dir)
{
	zfs_dsl_dataset_t *ds;
	dnode_phys_t *dnode;
	uint64_t deadlistid;

	ds = ecalloc(1, sizeof(*ds));

	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DSL_DATASET,
	    DMU_OT_DSL_DATASET, sizeof(dsl_dataset_phys_t), &ds->dsid);
	ds->phys = (dsl_dataset_phys_t *)DN_BONUS(dnode);

	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_DEADLIST,
	    DMU_OT_DEADLIST_HDR, sizeof(dsl_deadlist_phys_t), &deadlistid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	ds->phys->ds_dir_obj = dir->dirid;
	ds->phys->ds_deadlist_obj = deadlistid;
	ds->phys->ds_creation_txg = TXG - 1;
	/* zfs->snapds, the $ORIGIN snapshot, gets no previous-snapshot TXG. */
	if (ds != zfs->snapds)
		ds->phys->ds_prev_snap_txg = TXG - 1;
	ds->phys->ds_guid = ((uint64_t)random() << 32) | random();
	ds->dir = dir;

	return (ds);
}

981
usr.sbin/makefs/zfs/fs.c Normal file
View File

@ -0,0 +1,981 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2022 The FreeBSD Foundation
*
* This software was developed by Mark Johnston under sponsorship from
* the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/dirent.h>
#include <sys/stat.h>
#include <assert.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <util.h>
#include "makefs.h"
#include "zfs.h"
/*
 * Description of one ZPL system attribute: registry name and ID, fixed size
 * (0 for variable-length attributes), and byteswap type.
 */
typedef struct {
	const char	*name;
	unsigned int	id;
	uint16_t	size;
	sa_bswap_type_t	bs;
} zfs_fs_t_sattr_placeholder_do_not_use;
/*
 * The order of the attributes doesn't matter, this is simply the one hard-coded
 * by OpenZFS, based on a zdb dump of the SA_REGISTRY table.  These values
 * index zpl_attrs[] and the per-filesystem attribute offset table.
 */
typedef enum zpl_attr {
	ZPL_ATIME,
	ZPL_MTIME,
	ZPL_CTIME,
	ZPL_CRTIME,
	ZPL_GEN,
	ZPL_MODE,
	ZPL_SIZE,
	ZPL_PARENT,
	ZPL_LINKS,
	ZPL_XATTR,
	ZPL_RDEV,
	ZPL_FLAGS,
	ZPL_UID,
	ZPL_GID,
	ZPL_PAD,
	ZPL_ZNODE_ACL,
	ZPL_DACL_COUNT,
	ZPL_SYMLINK,
	ZPL_SCANSTAMP,
	ZPL_DACL_ACES,
	ZPL_DXATTR,
	ZPL_PROJID,
} zpl_attr_t;
/*
 * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t.
 */
static const zfs_sattr_t zpl_attrs[] = {
#define	_ZPL_ATTR(n, s, b)	{ .name = #n, .id = n, .size = s, .bs = b }
	_ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY),
	_ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY),
	_ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY),
	_ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL),
	_ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY),
	_ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY),
/* Was "#undef ZPL_ATTR", which undefined the wrong name and left _ZPL_ATTR
 * visible for the rest of the file. */
#undef _ZPL_ATTR
};
/*
 * This layout matches that of a filesystem created using OpenZFS on FreeBSD.
 * It need not match in general, but FreeBSD's loader doesn't bother parsing the
 * layout and just hard-codes attribute offsets.
 */
static const sa_attr_type_t zpl_attr_layout[] = {
	ZPL_MODE,
	ZPL_SIZE,
	ZPL_GEN,
	ZPL_UID,
	ZPL_GID,
	ZPL_PARENT,
	ZPL_FLAGS,
	ZPL_ATIME,
	ZPL_MTIME,
	ZPL_CTIME,
	ZPL_CRTIME,
	ZPL_LINKS,
	ZPL_DACL_COUNT,
	ZPL_DACL_ACES,
	/* Variable-length attributes come last. */
	ZPL_SYMLINK,
};
/*
* Keys for the ZPL attribute tables in the SA layout ZAP. The first two
* indices are reserved for legacy attribute encoding.
*/
#define SA_LAYOUT_INDEX_DEFAULT 2
#define SA_LAYOUT_INDEX_SYMLINK 3
/*
 * One level of the directory stack maintained while walking the staged tree.
 */
struct fs_populate_dir {
	SLIST_ENTRY(fs_populate_dir) next;
	int		dirfd;		/* fd for this staged directory */
	uint64_t	objid;		/* directory's dnode ID */
	zfs_zap_t	*zap;		/* directory-entry ZAP, filled as we go */
};

/*
 * State shared by the fsnode_foreach() callbacks while populating one
 * filesystem.
 */
struct fs_populate_arg {
	zfs_opt_t	*zfs;
	zfs_fs_t	*fs;		/* owning filesystem */
	int		dirfd;		/* current directory fd */
	uint64_t	rootdirid;	/* root directory dnode ID */
	SLIST_HEAD(, fs_populate_dir) dirs;	/* stack of directories */
};
static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int);
/*
 * Is this the root of a staged directory tree?  makefs names it ".".
 */
static bool
fsnode_isroot(const fsnode *cur)
{
	return (cur->name[0] == '.' && cur->name[1] == '\0');
}
/*
 * Visit each node in a directory hierarchy, in pre-order depth-first order.
 * If the callback returns zero for a directory, its children are skipped.
 */
static void
fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg)
{
	assert(root->type == S_IFDIR);

	for (fsnode *cur = root; cur != NULL; cur = cur->next) {
		/* Only regular files, directories and symlinks are expected. */
		assert(cur->type == S_IFREG || cur->type == S_IFDIR ||
		    cur->type == S_IFLNK);

		if (cb(cur, arg) == 0)
			continue;
		if (cur->type == S_IFDIR && cur->child != NULL)
			fsnode_foreach(cur->child, cb, arg);
	}
}
/*
 * Add a directory entry for "cur" to the ZAP of the directory on top of the
 * stack, encoding the entry's dnode ID together with its type.
 */
static void
fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid)
{
	struct fs_populate_dir *dir;
	uint64_t type;

	switch (cur->type) {
	case S_IFREG:
		type = DT_REG;
		break;
	case S_IFDIR:
		type = DT_DIR;
		break;
	case S_IFLNK:
		type = DT_LNK;
		break;
	default:
		/* fsnode_foreach() asserts only the three types above occur. */
		assert(0);
	}

	dir = SLIST_FIRST(&arg->dirs);
	zap_add_uint64(dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid));
}
/*
 * Copy a fixed-size system attribute value into the attribute buffer at its
 * registered offset, and add its size to the running total.
 */
static void
fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind,
    size_t *szp)
{
	assert(ind < fs->sacnt);
	/* 0xffff marks an attribute absent from the layout. */
	assert(fs->saoffs[ind] != 0xffff);

	memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size);
	*szp += fs->satab[ind].size;
}
/*
 * Copy a variable-sized attribute value into the attribute buffer; "varoff"
 * is the accumulated size of any preceding variable-length attributes.
 */
static void
fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val,
    size_t valsz, size_t varoff, uint16_t ind, size_t *szp)
{
	assert(ind < fs->sacnt);
	assert(fs->saoffs[ind] != 0xffff);
	/* Variable-length attributes are registered with size 0. */
	assert(fs->satab[ind].size == 0);

	memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz);
	*szp += valsz;
}
/*
 * Populate the system-attribute (SA) bonus buffer of a file's dnode: the
 * standard ZPL attributes (mode, uid/gid, link count, size, timestamps, ...),
 * a trivial NFSv4-style ACL derived from the file mode, and, for symlinks,
 * the link target itself.
 */
static void
fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur,
    dnode_phys_t *dnode)
{
	char target[PATH_MAX];
	zfs_fs_t *fs;
	zfs_ace_hdr_t aces[3];
	struct stat *sb;
	sa_hdr_phys_t *sahdr;
	uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid;
	char *attrbuf;
	size_t bonussz, hdrsz;
	int layout;

	assert(dnode->dn_bonustype == DMU_OT_SA);
	assert(dnode->dn_nblkptr == 1);

	fs = arg->fs;
	sb = &cur->inode->st;

	/* Compute the type-dependent attributes first. */
	switch (cur->type) {
	case S_IFREG:
		layout = SA_LAYOUT_INDEX_DEFAULT;
		links = cur->inode->nlink;
		objsize = sb->st_size;
		parent = SLIST_FIRST(&arg->dirs)->objid;
		break;
	case S_IFDIR:
		layout = SA_LAYOUT_INDEX_DEFAULT;
		links = 1; /* .. */
		objsize = 1; /* .. */
		/*
		 * The size of a ZPL directory is the number of entries
		 * (including "." and ".."), and the link count is the number of
		 * entries which are directories (including "." and "..").
		 */
		for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child;
		    c != NULL; c = c->next) {
			if (c->type == S_IFDIR)
				links++;
			objsize++;
		}
		/* The root directory is its own parent. */
		parent = SLIST_EMPTY(&arg->dirs) ?
		    arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid;
		break;
	case S_IFLNK: {
		ssize_t n;

		/* Fetch the link target from the staging directory. */
		if ((n = readlinkat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name,
		    target, sizeof(target) - 1)) == -1)
			err(1, "readlinkat(%s)", cur->name);
		target[n] = '\0';

		layout = SA_LAYOUT_INDEX_SYMLINK;
		links = 1;
		objsize = strlen(target);
		parent = SLIST_FIRST(&arg->dirs)->objid;
		break;
	}
	default:
		assert(0);
	}

	daclcount = nitems(aces);
	flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_NO_EXECS_DENIED |
	    ZFS_ARCHIVE | ZFS_AV_MODIFIED; /* XXX-MJ */
	gen = 1;
	gid = sb->st_gid;
	mode = sb->st_mode;
	uid = sb->st_uid;

	/*
	 * Build a trivial ACL equivalent to the file mode: one ACE each for
	 * the owner, the owning group, and everyone else.
	 */
	memset(aces, 0, sizeof(aces));
	aces[0].z_flags = ACE_OWNER;
	aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
	aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER |
	    ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL |
	    ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
	if ((mode & S_IRUSR) != 0)
		aces[0].z_access_mask |= ACE_READ_DATA;
	if ((mode & S_IWUSR) != 0)
		aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
	if ((mode & S_IXUSR) != 0)
		aces[0].z_access_mask |= ACE_EXECUTE;

	aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP;
	aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
	aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
	    ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
	if ((mode & S_IRGRP) != 0)
		aces[1].z_access_mask |= ACE_READ_DATA;
	if ((mode & S_IWGRP) != 0)
		aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
	if ((mode & S_IXGRP) != 0)
		aces[1].z_access_mask |= ACE_EXECUTE;

	aces[2].z_flags = ACE_EVERYONE;
	aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
	aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
	    ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
	if ((mode & S_IROTH) != 0)
		aces[2].z_access_mask |= ACE_READ_DATA;
	if ((mode & S_IWOTH) != 0)
		aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
	if ((mode & S_IXOTH) != 0)
		aces[2].z_access_mask |= ACE_EXECUTE;

	/* The header size depends on the number of variable-size attributes. */
	switch (layout) {
	case SA_LAYOUT_INDEX_DEFAULT:
		/* At most one variable-length attribute. */
		hdrsz = sizeof(uint64_t);
		break;
	case SA_LAYOUT_INDEX_SYMLINK:
		/* At most five variable-length attributes. */
		hdrsz = sizeof(uint64_t) * 2;
		break;
	default:
		assert(0);
	}

	sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode);
	sahdr->sa_magic = SA_MAGIC;
	SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz);

	/* Fixed-size attributes follow the SA header, in layout order. */
	bonussz = SA_HDR_SIZE(sahdr);
	attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr);

	fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz);
	fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz);
	fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz);
	fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz);
	fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz);
	fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz);
	fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz);
	fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz);
	fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz);

	/*
	 * We deliberately set atime = mtime here to ensure that images are
	 * reproducible.
	 */
	assert(sizeof(sb->st_mtim) == fs->satab[ZPL_ATIME].size);
	fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz);
	assert(sizeof(sb->st_ctim) == fs->satab[ZPL_CTIME].size);
	fs_populate_attr(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz);
	assert(sizeof(sb->st_mtim) == fs->satab[ZPL_MTIME].size);
	fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz);
	assert(sizeof(sb->st_birthtim) == fs->satab[ZPL_CRTIME].size);
	fs_populate_attr(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz);

	/* The ACL is the first variable-size attribute. */
	fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0,
	    ZPL_DACL_ACES, &bonussz);
	sahdr->sa_lengths[0] = sizeof(aces);

	if (cur->type == S_IFLNK) {
		assert(layout == SA_LAYOUT_INDEX_SYMLINK);
		/* Need to use a spill block pointer if the target is long. */
		assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN);
		fs_populate_varszattr(fs, attrbuf, target, objsize,
		    sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz);
		sahdr->sa_lengths[1] = (uint16_t)objsize;
	}

	dnode->dn_bonuslen = bonussz;
}
/*
 * Create a dnode for a regular file, copy the file's contents into newly
 * allocated vdev space, set its ZPL attributes, and add a directory entry for
 * it.  A hard link to an already-populated file simply reuses the existing
 * dnode ID.
 */
static void
fs_populate_file(fsnode *cur, struct fs_populate_arg *arg)
{
	struct dnode_cursor *c;
	dnode_phys_t *dnode;
	zfs_opt_t *zfs;
	char *buf;
	uint64_t dnid;
	ssize_t n;
	size_t bufsz;
	off_t size, target;
	int fd;

	assert(cur->type == S_IFREG);
	assert((cur->inode->flags & FI_ROOT) == 0);

	zfs = arg->zfs;

	/* fs_foreach_mark() assigned a virtual dnode number earlier. */
	assert(cur->inode->ino != 0);
	if ((cur->inode->flags & FI_ALLOCATED) != 0) {
		/*
		 * This is a hard link of an existing file.
		 *
		 * XXX-MJ need to check whether it crosses datasets, add a test
		 * case for that
		 */
		fs_populate_dirent(arg, cur, cur->inode->ino);
		return;
	}

	dnode = objset_dnode_bonus_alloc(arg->fs->os,
	    DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
	/* Replace the virtual dnode number with the real one. */
	cur->inode->ino = dnid;
	cur->inode->flags |= FI_ALLOCATED;

	fd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name, O_RDONLY);
	if (fd == -1)
		err(1, "openat(%s)", cur->name);

	buf = zfs->filebuf;
	bufsz = sizeof(zfs->filebuf);
	size = cur->inode->st.st_size;

	/* Copy the file a buffer at a time, writing indirect blocks as we go. */
	c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0);
	for (off_t foff = 0; foff < size; foff += target) {
		off_t loc, sofar;

		/*
		 * Fill up our buffer, handling partial reads.
		 *
		 * It might be profitable to use copy_file_range(2) here.
		 */
		sofar = 0;
		target = MIN(size - foff, (off_t)bufsz);
		do {
			n = read(fd, buf + sofar, target);
			if (n < 0)
				err(1, "reading from '%s'", cur->name);
			if (n == 0)
				errx(1, "unexpected EOF reading '%s'",
				    cur->name);
			sofar += n;
		} while (sofar < target);

		/* Zero-pad the final, partial block. */
		if (target < (off_t)bufsz)
			memset(buf + target, 0, bufsz - target);
		/* Note: objset_space_alloc() may round "target" up. */
		loc = objset_space_alloc(zfs, arg->fs->os, &target);
		vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, target, loc,
		    dnode_cursor_next(zfs, c, foff));
	}
	if (close(fd) != 0)
		err(1, "close");
	dnode_cursor_finish(zfs, c);

	fs_populate_sattrs(arg, cur, dnode);
	fs_populate_dirent(arg, cur, dnid);
}
/*
 * Create a dnode for a directory and add it to its parent directory.  A plain
 * directory is pushed onto the directory stack so that children visited next
 * are entered into its ZAP; a directory that is the root of another dataset
 * stays empty in this objset and the child dataset is built recursively
 * instead.
 */
static void
fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg)
{
	dnode_phys_t *dnode;
	zfs_objset_t *os;
	uint64_t dnid;
	int dirfd;

	assert(cur->type == S_IFDIR);
	assert((cur->inode->flags & FI_ALLOCATED) == 0);

	os = arg->fs->os;

	dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS,
	    DMU_OT_SA, 0, &dnid);

	/*
	 * Add an entry to the parent directory and open this directory.
	 */
	if (!SLIST_EMPTY(&arg->dirs)) {
		fs_populate_dirent(arg, cur, dnid);
		dirfd = openat(SLIST_FIRST(&arg->dirs)->dirfd, cur->name,
		    O_DIRECTORY);
		if (dirfd < 0)
			err(1, "open(%s)", cur->name);
	} else {
		/* An empty stack means this is the dataset's root directory. */
		arg->rootdirid = dnid;
		dirfd = arg->dirfd;
	}

	/*
	 * Set ZPL attributes.
	 */
	fs_populate_sattrs(arg, cur, dnode);

	/*
	 * If this is a root directory, then its children belong to a different
	 * dataset and this directory remains empty in the current objset.
	 */
	if ((cur->inode->flags & FI_ROOT) == 0) {
		struct fs_populate_dir *dir;

		dir = ecalloc(1, sizeof(*dir));
		dir->dirfd = dirfd;
		dir->objid = dnid;
		dir->zap = zap_alloc(os, dnode);
		SLIST_INSERT_HEAD(&arg->dirs, dir, next);
	} else {
		/* Write an empty ZAP and build the child dataset in place. */
		zap_write(arg->zfs, zap_alloc(os, dnode));
		fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd);
	}
}
/*
 * Create a dnode for a symbolic link and enter it into its parent directory.
 * The link target itself is stored in the file's system attributes by
 * fs_populate_sattrs().
 */
static void
fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg)
{
	dnode_phys_t *dnode;
	uint64_t id;

	assert(cur->type == S_IFLNK);
	assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0);

	dnode = objset_dnode_bonus_alloc(arg->fs->os,
	    DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &id);

	fs_populate_dirent(arg, cur, id);
	fs_populate_sattrs(arg, cur, dnode);
}
/*
 * fsnode_foreach() callback which adds the visited file to the current
 * dataset.  Returns 0 (don't descend) for the root of another dataset, whose
 * subtree was handled by a recursive fs_build_one() call, and 1 otherwise.
 * Upon reaching the rightmost leaf of a subtree, the completed directories on
 * the stack are written out and popped.
 */
static int
fs_foreach_populate(fsnode *cur, void *_arg)
{
	struct fs_populate_arg *arg;
	struct fs_populate_dir *dir;
	int ret;

	arg = _arg;
	switch (cur->type) {
	case S_IFREG:
		fs_populate_file(cur, arg);
		break;
	case S_IFDIR:
		/* The dataset root was already handled by fs_build_one(). */
		if (fsnode_isroot(cur))
			break;
		fs_populate_dir(cur, arg);
		break;
	case S_IFLNK:
		fs_populate_symlink(cur, arg);
		break;
	default:
		assert(0);
	}

	ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1;

	if (cur->next == NULL &&
	    (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) {
		/*
		 * We reached a terminal node in a subtree.  Walk back up and
		 * write out directories.  We're done once we hit the root of a
		 * dataset or find a level where we're not on the edge of the
		 * tree.
		 */
		do {
			dir = SLIST_FIRST(&arg->dirs);
			SLIST_REMOVE_HEAD(&arg->dirs, next);
			zap_write(arg->zfs, dir->zap);
			/* The root directory's fd (-1 or arg->dirfd) is not ours. */
			if (dir->dirfd != -1 && close(dir->dirfd) != 0)
				err(1, "close");
			free(dir);
			cur = cur->parent;
		} while (cur != NULL && cur->next == NULL &&
		    (cur->inode->flags & FI_ROOT) == 0);
	}
	return (ret);
}
/*
 * Add an SA layout - an array of attribute indices - to the layout ZAP,
 * keyed by the decimal string form of the layout index.
 */
static void
fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index,
    const sa_attr_type_t layout[], size_t sacnt)
{
	char key[16];

	assert(sizeof(layout[0]) == 2);

	(void)snprintf(key, sizeof(key), "%u", index);
	zap_add(zap, key, sizeof(sa_attr_type_t), sacnt,
	    (const uint8_t *)layout);
}
/*
* Initialize system attribute tables.
*
* There are two elements to this. First, we write the zpl_attrs[] and
* zpl_attr_layout[] tables to disk. Then we create a lookup table which
* allows us to set file attributes quickly.
*/
/*
 * Write the SA registry, layout, and master node objects for this dataset and
 * build the in-memory offset table used by fs_populate_attr() and friends.
 * Returns the object ID of the SA master node, which is recorded in the
 * dataset's master node ZAP under ZFS_SA_ATTRS.
 */
static uint64_t
fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs)
{
	zfs_zap_t *sazap, *salzap, *sarzap;
	zfs_objset_t *os;
	dnode_phys_t *saobj, *salobj, *sarobj;
	uint64_t saobjid, salobjid, sarobjid;
	uint16_t offset;

	os = fs->os;

	/*
	 * The on-disk tables are stored in two ZAP objects, the registry object
	 * and the layout object.  Individual attributes are described by
	 * entries in the registry object; for example, the value for the
	 * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute.
	 * The attributes of a file are ordered according to one of the layouts
	 * defined in the layout object.  The master node object is simply used
	 * to locate the registry and layout objects.
	 */
	saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid);
	salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid);
	sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid);

	/* Register each known attribute, keyed by name. */
	sarzap = zap_alloc(os, sarobj);
	for (size_t i = 0; i < nitems(zpl_attrs); i++) {
		const zfs_sattr_t *sa;
		uint64_t attr;

		attr = 0;
		sa = &zpl_attrs[i];
		SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs);
		zap_add_uint64(sarzap, sa->name, attr);
	}
	zap_write(zfs, sarzap);

	/*
	 * Layouts are arrays of indices into the registry.  We define two
	 * layouts for use by the ZPL, one for non-symlinks and one for
	 * symlinks.  They are identical except that the symlink layout includes
	 * ZPL_SYMLINK as its final attribute.
	 */
	salzap = zap_alloc(os, salobj);
	assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK);
	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT,
	    zpl_attr_layout, nitems(zpl_attr_layout) - 1);
	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK,
	    zpl_attr_layout, nitems(zpl_attr_layout));
	zap_write(zfs, salzap);

	/* The master node points at the registry and layout objects. */
	sazap = zap_alloc(os, saobj);
	zap_add_uint64(sazap, SA_LAYOUTS, salobjid);
	zap_add_uint64(sazap, SA_REGISTRY, sarobjid);
	zap_write(zfs, sazap);

	/* Sanity check. */
	for (size_t i = 0; i < nitems(zpl_attrs); i++)
		assert(i == zpl_attrs[i].id);

	/*
	 * Build the offset table used when setting file attributes.  File
	 * attributes are stored in the object's bonus buffer; this table
	 * provides the buffer offset of attributes referenced by the layout
	 * table.  0xffff marks attributes absent from the layout.
	 */
	fs->sacnt = nitems(zpl_attrs);
	fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs));
	for (size_t i = 0; i < fs->sacnt; i++)
		fs->saoffs[i] = 0xffff;
	offset = 0;
	for (size_t i = 0; i < nitems(zpl_attr_layout); i++) {
		uint16_t size;

		assert(zpl_attr_layout[i] < fs->sacnt);

		fs->saoffs[zpl_attr_layout[i]] = offset;
		size = zpl_attrs[zpl_attr_layout[i]].size;
		offset += size;
	}
	fs->satab = zpl_attrs;

	return (saobjid);
}
/*
 * dsl_dir_foreach() callback which matches a mountable dataset with the
 * staged directory (fsnode) corresponding to its mountpoint, flagging that
 * fsnode as a dataset root.  If a bootfs was requested and matches this
 * dataset, its dataset ID is also recorded in the pool properties.
 *
 * Fix: the early "canmount == 0" return leaked the mountpoint string, which
 * this function owns (it is freed as "origmountpoint" on the normal path).
 */
static void
fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg)
{
	char *mountpoint, *origmountpoint, *name, *next;
	fsnode *cur, *root;
	uint64_t canmount;

	if (!dsl_dir_has_dataset(dsldir))
		return;

	mountpoint = dsl_dir_get_mountpoint(zfs, dsldir);
	if (mountpoint == NULL)
		return;
	if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0) {
		/* Don't leak the mountpoint string on this early return. */
		free(mountpoint);
		return;
	}

	/*
	 * If we were asked to specify a bootfs, set it here.
	 */
	if (zfs->bootfs != NULL && strcmp(zfs->bootfs,
	    dsl_dir_fullname(dsldir)) == 0) {
		zap_add_uint64(zfs->poolprops, "bootfs",
		    dsl_dir_dataset_id(dsldir));
	}

	origmountpoint = mountpoint;

	/*
	 * Figure out which fsnode corresponds to our mountpoint.
	 */
	root = arg;
	cur = root;
	if (strcmp(mountpoint, zfs->rootpath) != 0) {
		mountpoint += strlen(zfs->rootpath);

		/*
		 * Look up the directory in the staged tree.  For example, if
		 * the dataset's mount point is /foo/bar/baz, we'll search the
		 * root directory for "foo", search "foo" for "bar", and so on.
		 * Each intermediate name must refer to a directory; the final
		 * component need not exist.
		 */
		cur = root;
		for (next = name = mountpoint; next != NULL;) {
			for (; *next == '/'; next++)
				;
			name = strsep(&next, "/");

			for (; cur != NULL && strcmp(cur->name, name) != 0;
			    cur = cur->next)
				;
			if (cur == NULL) {
				if (next == NULL)
					break;
				errx(1, "missing mountpoint directory for `%s'",
				    dsl_dir_fullname(dsldir));
			}
			if (cur->type != S_IFDIR) {
				errx(1,
				    "mountpoint for `%s' is not a directory",
				    dsl_dir_fullname(dsldir));
			}
			if (next != NULL)
				cur = cur->child;
		}
	}

	if (cur != NULL) {
		assert(cur->type == S_IFDIR);

		/*
		 * Multiple datasets shouldn't share a mountpoint.  It's
		 * technically allowed, but it's not clear what makefs should do
		 * in that case.
		 */
		assert((cur->inode->flags & FI_ROOT) == 0);
		if (cur != root)
			cur->inode->flags |= FI_ROOT;
		assert(cur->inode->param == NULL);
		cur->inode->param = dsldir;
	}

	free(origmountpoint);
}
/*
 * fsnode_foreach() callback which detects hard links (by counting visits to
 * each inode) and assigns provisional dnode numbers.  Returns 0 to stop
 * descending at the root of another dataset.
 */
static int
fs_foreach_mark(fsnode *cur, void *arg)
{
	uint64_t *countp;

	countp = arg;
	if (cur->type == S_IFDIR && fsnode_isroot(cur))
		return (1);

	if (cur->inode->ino != 0) {
		/* Already visited: this is another link to the same inode. */
		cur->inode->nlink++;
	} else {
		cur->inode->ino = ++(*countp);
		cur->inode->nlink = 1;
	}
	return ((cur->inode->flags & FI_ROOT) == 0 ? 1 : 0);
}
/*
* Create a filesystem dataset. More specifically:
* - create an object set for the dataset,
* - add required metadata (SA tables, property definitions, etc.) to that
* object set,
* - optionally populate the object set with file objects, using "root" as the
* root directory.
*
* "dirfd" is a directory descriptor for the directory referenced by "root". It
* is closed before returning.
*/
static void
fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd)
{
	struct fs_populate_arg arg;
	zfs_fs_t fs;
	zfs_zap_t *masterzap;
	zfs_objset_t *os;
	dnode_phys_t *deleteq, *masterobj;
	uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid;
	bool fakedroot;

	/*
	 * This dataset's mountpoint doesn't exist in the staging tree, or the
	 * dataset doesn't have a mountpoint at all.  In either case we still
	 * need a root directory.  Fake up a root fsnode to handle this case.
	 */
	fakedroot = root == NULL;
	if (fakedroot) {
		struct stat *stp;

		assert(dirfd == -1);

		root = ecalloc(1, sizeof(*root));
		root->inode = ecalloc(1, sizeof(*root->inode));
		root->name = estrdup(".");
		root->type = S_IFDIR;

		stp = &root->inode->st;
		stp->st_uid = 0;
		stp->st_gid = 0;
		stp->st_mode = S_IFDIR | 0755;
	}
	assert(root->type == S_IFDIR);
	assert(fsnode_isroot(root));

	/*
	 * Initialize the object set for this dataset.
	 */
	os = objset_alloc(zfs, DMU_OST_ZFS);
	masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid);
	assert(moid == MASTER_NODE_OBJ);

	memset(&fs, 0, sizeof(fs));
	fs.os = os;

	/*
	 * Create the ZAP SA layout now since filesystem object dnodes will
	 * refer to those attributes.
	 */
	saobjid = fs_set_zpl_attrs(zfs, &fs);

	/*
	 * Make a pass over the staged directory to detect hard links and assign
	 * virtual dnode numbers.
	 */
	dnodecount = 1; /* root directory */
	fsnode_foreach(root, fs_foreach_mark, &dnodecount);

	/*
	 * Make a second pass to populate the dataset with files from the
	 * staged directory.  Most of our runtime is spent here.
	 */
	arg.dirfd = dirfd;
	arg.zfs = zfs;
	arg.fs = &fs;
	SLIST_INIT(&arg.dirs);
	/* Push the root directory onto the stack... */
	fs_populate_dir(root, &arg);
	assert(!SLIST_EMPTY(&arg.dirs));
	/* ...and let the traversal populate and pop everything. */
	fsnode_foreach(root, fs_foreach_populate, &arg);
	assert(SLIST_EMPTY(&arg.dirs));
	rootdirid = arg.rootdirid;

	/*
	 * Create an empty delete queue.  We don't do anything with it, but
	 * OpenZFS will refuse to mount filesystems that don't have one.
	 */
	deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid);
	zap_write(zfs, zap_alloc(os, deleteq));

	/*
	 * Populate and write the master node object.  This is a ZAP object
	 * containing various dataset properties and the object IDs of the root
	 * directory and delete queue.
	 */
	masterzap = zap_alloc(os, masterobj);
	zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid);
	zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid);
	zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid);
	zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */);
	zap_add_uint64(masterzap, "normalization", 0 /* off */);
	zap_add_uint64(masterzap, "utf8only", 0 /* off */);
	zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */);
	zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */);
	zap_write(zfs, masterzap);

	/*
	 * All finished with this object set, we may as well write it now.
	 * The DSL layer will sum up the bytes consumed by each dataset using
	 * information stored in the object set, so it can't be freed just yet.
	 */
	dsl_dir_dataset_write(zfs, os, dsldir);

	if (fakedroot) {
		free(root->inode);
		free(root->name);
		free(root);
	}
	free(fs.saoffs);
}
/*
* Create an object set for each DSL directory which has a dataset and doesn't
* already have an object set.
*/
/*
 * dsl_dir_foreach() callback which creates an empty object set for any
 * dataset that was never matched to a staged directory.
 */
static void
fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused)
{
	if (!dsl_dir_has_dataset(dsldir))
		return;
	if (dsl_dir_dataset_has_objset(dsldir))
		return;
	fs_build_one(zfs, dsldir, NULL, -1);
}
/*
* Create our datasets and populate them with files.
*/
/*
 * Entry point for dataset construction: lay out datasets over the staged
 * tree, then build and populate every object set.
 */
void
fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root)
{
	/*
	 * First pass: match each mountable dataset to the staged directory
	 * that will become its root, flagging those fsnodes so the traversal
	 * below knows where one dataset ends and another begins.
	 */
	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root);

	/*
	 * fs_layout_one() records the bootfs dataset ID when it finds a
	 * match; if the property is still missing, the user asked for a
	 * bootfs that doesn't correspond to any mounted dataset.
	 */
	if (zfs->bootfs != NULL && !zap_entry_exists(zfs->poolprops, "bootfs"))
		errx(1, "no mounted dataset matches bootfs property `%s'",
		    zfs->bootfs);

	/*
	 * Build the dataset owning the root directory (not necessarily the
	 * root dataset).  Datasets rooted deeper in the tree are created
	 * recursively as fs_build_one() encounters their flagged roots, so
	 * all mounted datasets are fully populated after this call.
	 */
	fs_build_one(zfs, root->inode->param, root, dirfd);

	/*
	 * Finally, give every remaining dataset - those with no mountpoint,
	 * or whose mountpoint doesn't exist in the staging tree - an empty
	 * object set.
	 */
	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL);
}

View File

@ -0,0 +1,259 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2022 The FreeBSD Foundation
*
* This software was developed by Mark Johnston under sponsorship from
* the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <assert.h>
#include <string.h>
#include <util.h>
#include "zfs.h"
/* Number of dnodes in one allocation chunk (a maximum-size block's worth). */
#define DNODES_PER_CHUNK (MAXBLOCKSIZE / sizeof(dnode_phys_t))

/*
 * A fixed-size array of dnodes.  Chunks are chained together so that dnode
 * storage can grow without ever moving, keeping previously returned
 * dnode_phys_t pointers valid.
 */
struct objset_dnode_chunk {
	dnode_phys_t buf[DNODES_PER_CHUNK];
	unsigned int nextfree;	/* index of the next unused slot in buf[] */
	STAILQ_ENTRY(objset_dnode_chunk) next;
};
/*
 * In-memory state for one object set: the physical objset block and its
 * location, space accounting, and the growable dnode array.
 */
typedef struct zfs_objset {
	/* Physical object set. */
	objset_phys_t *phys;
	off_t osloc;		/* vdev offset of the objset block */
	off_t osblksz;		/* size of the objset block */
	blkptr_t osbp; /* set in objset_write() */

	/* Accounting. */
	off_t space; /* bytes allocated to this objset */

	/* dnode allocator. */
	uint64_t dnodecount;	/* dnodes allocated, including the meta dnode */
	STAILQ_HEAD(, objset_dnode_chunk) dnodechunks;
} zfs_objset_t;
/*
 * Initialize the fields common to every dnode we create: one block pointer,
 * a single indirection level, Fletcher-4 checksums, and byte-based space
 * accounting.
 */
static void
dnode_init(dnode_phys_t *dnode, uint8_t type, uint8_t bonustype,
    uint16_t bonuslen)
{
	dnode->dn_type = type;
	dnode->dn_bonustype = bonustype;
	dnode->dn_bonuslen = bonuslen;
	dnode->dn_indblkshift = MAXBLOCKSHIFT;
	dnode->dn_nlevels = 1;
	dnode->dn_nblkptr = 1;
	dnode->dn_checksum = ZIO_CHECKSUM_FLETCHER_4;
	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
}
/*
 * Allocate a new object set of the given type (e.g., DMU_OST_ZFS) and
 * allocate vdev space for its physical objset block.
 */
zfs_objset_t *
objset_alloc(zfs_opt_t *zfs, uint64_t type)
{
	struct objset_dnode_chunk *chunk;
	zfs_objset_t *os;

	os = ecalloc(1, sizeof(*os));
	os->osblksz = sizeof(objset_phys_t);
	os->osloc = objset_space_alloc(zfs, os, &os->osblksz);

	/*
	 * Object ID zero is always reserved for the meta dnode, which is
	 * embedded in the objset itself.
	 */
	STAILQ_INIT(&os->dnodechunks);
	chunk = ecalloc(1, sizeof(*chunk));
	chunk->nextfree = 1;
	STAILQ_INSERT_HEAD(&os->dnodechunks, chunk, next);
	os->dnodecount = 1;

	os->phys = ecalloc(1, os->osblksz);
	os->phys->os_type = type;

	/* The meta dnode describes the dnode array itself. */
	dnode_init(&os->phys->os_meta_dnode, DMU_OT_DNODE, DMU_OT_NONE, 0);
	os->phys->os_meta_dnode.dn_datablkszsec =
	    DNODE_BLOCK_SIZE >> MINBLOCKSHIFT;

	return (os);
}
/*
* Write the dnode array and physical object set to disk.
*/
static void
_objset_write(zfs_opt_t *zfs, zfs_objset_t *os, struct dnode_cursor *c,
    off_t loc)
{
	struct objset_dnode_chunk *chunk, *tmp;
	unsigned int total;

	/*
	 * Write out the dnode array, i.e., the meta-dnode.  For some reason its
	 * data blocks must be 16KB in size no matter how large the array is.
	 */
	total = 0;
	STAILQ_FOREACH_SAFE(chunk, &os->dnodechunks, next, tmp) {
		unsigned int i;

		assert(chunk->nextfree <= os->dnodecount);
		assert(chunk->nextfree <= DNODES_PER_CHUNK);

		for (i = 0; i < chunk->nextfree; i += DNODES_PER_BLOCK) {
			blkptr_t *bp;
			uint64_t fill;

			/*
			 * NOTE(review): for a partial final block this sets
			 * "fill" to the number of *unused* dnode slots, and to
			 * 0 for full blocks, whereas ZFS fill counts normally
			 * record allocated dnodes per block - confirm against
			 * OpenZFS' meta-dnode fill semantics.
			 */
			if (chunk->nextfree - i < DNODES_PER_BLOCK)
				fill = DNODES_PER_BLOCK - (chunk->nextfree - i);
			else
				fill = 0;
			bp = dnode_cursor_next(zfs, c,
			    (total + i) * sizeof(dnode_phys_t));
			vdev_pwrite_dnode_indir(zfs, &os->phys->os_meta_dnode,
			    0, fill, chunk->buf + i, DNODE_BLOCK_SIZE, loc, bp);
			loc += DNODE_BLOCK_SIZE;
		}
		total += i;
		free(chunk);	/* chunks are not needed once written */
	}
	dnode_cursor_finish(zfs, c);
	STAILQ_INIT(&os->dnodechunks);

	/*
	 * Write the object set itself.  The saved block pointer will be copied
	 * into the referencing DSL dataset or the uberblocks.
	 */
	vdev_pwrite_data(zfs, DMU_OT_OBJSET, ZIO_CHECKSUM_FLETCHER_4, 0, 1,
	    os->phys, os->osblksz, os->osloc, &os->osbp);
}
/*
 * Allocate space for the object set's dnode array and write the object set
 * out.  Writing the MOS additionally finalizes the vdev space maps, since no
 * further space may be allocated afterwards.
 */
void
objset_write(zfs_opt_t *zfs, zfs_objset_t *os)
{
	struct dnode_cursor *c;
	off_t dnodeloc, dnodesz;
	uint64_t dnodecount;

	/*
	 * There is a chicken-and-egg problem here when writing the MOS: we
	 * cannot write space maps before we're finished allocating space from
	 * the vdev, and we can't write the MOS without having allocated space
	 * for indirect dnode blocks.  Thus, rather than lazily allocating
	 * indirect blocks for the meta-dnode (which would be simpler), they are
	 * allocated up-front and before writing space maps.
	 */
	dnodecount = os->dnodecount;
	if (os == zfs->mos) {
		/* Space map dnodes are added later by vdev_spacemap_write(). */
		dnodecount += zfs->mscount;
	}
	dnodesz = dnodecount * sizeof(dnode_phys_t);
	c = dnode_cursor_init(zfs, os, &os->phys->os_meta_dnode, dnodesz,
	    DNODE_BLOCK_SIZE);
	dnodesz = roundup2(dnodesz, DNODE_BLOCK_SIZE);
	dnodeloc = objset_space_alloc(zfs, os, &dnodesz);

	if (os == zfs->mos) {
		vdev_spacemap_write(zfs);

		/*
		 * We've finished allocating space, account for it in $MOS.
		 */
		dsl_dir_size_set(zfs->mosdsldir, os->space);
	}
	_objset_write(zfs, os, c, dnodeloc);
}
/*
 * Allocate a dnode with the given bonus-buffer configuration from the object
 * set, returning its object ID in *idp.  Storage grows a chunk at a time, so
 * previously returned dnode pointers stay valid.
 */
dnode_phys_t *
objset_dnode_bonus_alloc(zfs_objset_t *os, uint8_t type, uint8_t bonustype,
    uint16_t bonuslen, uint64_t *idp)
{
	struct objset_dnode_chunk *c;
	dnode_phys_t *dnode;

	assert(bonuslen <= DN_OLD_MAX_BONUSLEN);
	assert(!STAILQ_EMPTY(&os->dnodechunks));

	c = STAILQ_LAST(&os->dnodechunks, objset_dnode_chunk, next);
	if (c->nextfree == DNODES_PER_CHUNK) {
		/* The current chunk is full, so grab a fresh one. */
		c = ecalloc(1, sizeof(*c));
		STAILQ_INSERT_TAIL(&os->dnodechunks, c, next);
	}

	*idp = os->dnodecount++;
	dnode = &c->buf[c->nextfree++];
	dnode_init(dnode, type, bonustype, bonuslen);
	dnode->dn_datablkszsec = os->osblksz >> MINBLOCKSHIFT;
	return (dnode);
}
/*
 * Allocate a dnode with no bonus buffer.
 */
dnode_phys_t *
objset_dnode_alloc(zfs_objset_t *os, uint8_t type, uint64_t *idp)
{
	dnode_phys_t *dnode;

	dnode = objset_dnode_bonus_alloc(os, type, DMU_OT_NONE, 0, idp);
	return (dnode);
}
/*
* Look up a physical dnode by ID. This is not used often so a linear search is
* fine.
*/
/*
 * Look up a physical dnode by ID.  This is not used often so a linear search
 * over the chunk list is fine.  ID 0 (the meta dnode) lives in the objset
 * block itself, not in a chunk, and may not be looked up here.
 */
dnode_phys_t *
objset_dnode_lookup(zfs_objset_t *os, uint64_t id)
{
	struct objset_dnode_chunk *chunk;

	assert(id > 0);
	assert(id < os->dnodecount);

	chunk = STAILQ_FIRST(&os->dnodechunks);
	while (chunk != NULL) {
		if (id < DNODES_PER_CHUNK)
			return (&chunk->buf[id]);
		id -= DNODES_PER_CHUNK;
		chunk = STAILQ_NEXT(chunk, next);
	}
	assert(0);
	return (NULL);
}
/*
 * Allocate vdev space on behalf of an object set and charge it to the object
 * set's space accounting.  *lenp may be rounded up by the allocator.
 */
off_t
objset_space_alloc(zfs_opt_t *zfs, zfs_objset_t *os, off_t *lenp)
{
	off_t blkloc;

	blkloc = vdev_space_alloc(zfs, lenp);
	os->space += *lenp;
	return (blkloc);
}
/*
 * Return the number of bytes allocated to this object set so far.
 */
uint64_t
objset_space(const zfs_objset_t *os)
{
	uint64_t space;

	space = os->space;
	return (space);
}
/*
 * Copy out the object set's root block pointer, valid only after
 * objset_write() has been called.
 */
void
objset_root_blkptr_copy(const zfs_objset_t *os, blkptr_t *bp)
{
	*bp = os->osbp;
}

435
usr.sbin/makefs/zfs/vdev.c Normal file
View File

@ -0,0 +1,435 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2022 The FreeBSD Foundation
*
* This software was developed by Mark Johnston under sponsorship from
* the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <assert.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <util.h>
#include "zfs.h"
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-function"
#include "zfs/fletcher.c"
#include "zfs/sha256.c"
#pragma clang diagnostic pop
/*
 * Initialize a block pointer for a freshly written block: a single DVA on
 * vdev 0, no compression, host byte order, and the caller-provided checksum.
 */
static void
blkptr_set(blkptr_t *bp, off_t off, off_t size, uint8_t dntype, uint8_t level,
    uint64_t fill, enum zio_checksum cksumt, zio_cksum_t *cksum)
{
	dva_t *dva;

	assert(powerof2(size));

	BP_ZERO(bp);
	BP_SET_LSIZE(bp, size);
	/* No compression, so the physical size equals the logical size. */
	BP_SET_PSIZE(bp, size);
	BP_SET_CHECKSUM(bp, cksumt);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
	/* All blocks are "born" in the same fixed transaction group. */
	BP_SET_BIRTH(bp, TXG, TXG);
	BP_SET_LEVEL(bp, level);
	BP_SET_FILL(bp, fill);
	BP_SET_TYPE(bp, dntype);

	dva = BP_IDENTITY(bp);
	DVA_SET_VDEV(dva, 0);
	DVA_SET_OFFSET(dva, off);
	DVA_SET_ASIZE(dva, size);
	memcpy(&bp->blk_cksum, cksum, sizeof(*cksum));
}
/*
* Write a block of data to the vdev. The offset is always relative to the end
* of the second leading vdev label.
*
* Consumers should generally use the helpers below, which provide block
* pointers and update dnode accounting, rather than calling this function
* directly.
*/
static void
vdev_pwrite(const zfs_opt_t *zfs, const void *buf, size_t len, off_t off)
{
	ssize_t n;

	assert(off >= 0 && off < zfs->asize);
	assert(powerof2(len));
	assert((off_t)len > 0 && off + (off_t)len > off &&
	    off + (off_t)len < zfs->asize);
	if (zfs->spacemap != NULL) {
		/*
		 * Verify that the blocks being written were in fact allocated.
		 *
		 * The space map isn't available once the on-disk space map is
		 * finalized, so this check doesn't quite catch everything.
		 */
		assert(bit_ntest(zfs->spacemap, off >> zfs->ashift,
		    (off + len - 1) >> zfs->ashift, 1));
	}

	/* Translate to a physical offset past the leading vdev labels. */
	off += VDEV_LABEL_START_SIZE;
	/* Handle short writes. */
	for (size_t sofar = 0; sofar < len; sofar += n) {
		n = pwrite(zfs->fd, (const char *)buf + sofar, len - sofar,
		    off + sofar);
		if (n < 0)
			err(1, "pwrite");
		assert(n > 0);
	}
}
/*
 * Checksum and write a block of data, filling in the caller's block pointer.
 */
void
vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
    uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
    blkptr_t *bp)
{
	zio_cksum_t cksum;

	/* Fletcher-4 is the only data checksum we implement. */
	assert(cksumtype == ZIO_CHECKSUM_FLETCHER_4);

	fletcher_4_native(data, sz, NULL, &cksum);
	vdev_pwrite(zfs, data, sz, loc);
	blkptr_set(bp, loc, sz, datatype, level, fill, cksumtype, &cksum);
}
/*
 * Write a block belonging to a dnode (data or indirect), updating the dnode's
 * used-bytes accounting.
 */
void
vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
    uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp)
{
	/* dnode_init() always enables byte-based accounting. */
	assert((dnode->dn_flags & DNODE_FLAG_USED_BYTES) != 0);

	vdev_pwrite_data(zfs, dnode->dn_type, dnode->dn_checksum, level, fill,
	    data, sz, loc, bp);
	dnode->dn_used += sz;
}
/*
 * Write a dnode's single level-0 data block through its first block pointer.
 */
void
vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data,
    off_t sz, off_t loc)
{
	blkptr_t *bp;

	bp = &dnode->dn_blkptr[0];
	vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, data, sz, loc, bp);
}
/*
 * Set the embedded SHA-256 verifier checksum for a region of a vdev label.
 * The region ends in a zio_eck_t trailer whose checksum field is seeded with
 * the region's device offset before the region is hashed.
 */
static void
vdev_label_set_checksum(void *buf, off_t off, off_t size)
{
	zio_cksum_t cksum;
	zio_eck_t *trailer;

	assert(size > 0 && (size_t)size >= sizeof(zio_eck_t));

	trailer = (zio_eck_t *)((char *)buf + size) - 1;
	trailer->zec_magic = ZEC_MAGIC;
	ZIO_SET_CHECKSUM(&trailer->zec_cksum, off, 0, 0, 0);
	/* Hash the region, trailer included, then store the result. */
	zio_checksum_SHA256(buf, size, NULL, &cksum);
	trailer->zec_cksum = cksum;
}
/*
* Set embedded checksums and write the label at the specified index.
*/
void
vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp)
{
	vdev_label_t *label;
	ssize_t n;
	off_t blksz, loff;

	assert(ind >= 0 && ind < VDEV_LABELS);

	/*
	 * Make a copy since we have to modify the label to set checksums.
	 */
	label = ecalloc(1, sizeof(*label));
	memcpy(label, labelp, sizeof(*label));

	/* Labels 0 and 1 lead the vdev; the rest trail it. */
	if (ind < 2)
		loff = ind * sizeof(*label);
	else
		loff = zfs->vdevsize - (VDEV_LABELS - ind) * sizeof(*label);

	/*
	 * Set the verifier checksum for the boot block.  We don't use it, but
	 * the FreeBSD loader reads it and will complain if the checksum isn't
	 * valid.
	 */
	vdev_label_set_checksum(&label->vl_be,
	    loff + __offsetof(vdev_label_t, vl_be), sizeof(label->vl_be));

	/*
	 * Set the verifier checksum for the label.
	 */
	vdev_label_set_checksum(&label->vl_vdev_phys,
	    loff + __offsetof(vdev_label_t, vl_vdev_phys),
	    sizeof(label->vl_vdev_phys));

	/*
	 * Set the verifier checksum for the uberblocks.  There is one uberblock
	 * per sector; for example, with an ashift of 12 we end up with
	 * 128KB/4KB=32 copies of the uberblock in the ring.
	 */
	blksz = 1 << zfs->ashift;
	assert(sizeof(label->vl_uberblock) % blksz == 0);
	for (size_t roff = 0; roff < sizeof(label->vl_uberblock);
	    roff += blksz) {
		vdev_label_set_checksum(&label->vl_uberblock[0] + roff,
		    loff + __offsetof(vdev_label_t, vl_uberblock) + roff,
		    blksz);
	}

	/* Labels are written at absolute offsets, unlike vdev_pwrite(). */
	n = pwrite(zfs->fd, label, sizeof(*label), loff);
	if (n < 0)
		err(1, "writing vdev label");
	assert(n == sizeof(*label));

	free(label);
}
/*
* Find a chunk of contiguous free space of length *lenp, according to the
* following rules:
* 1. If the length is less than or equal to 128KB, the returned run's length
* will be the smallest power of 2 equal to or larger than the length.
* 2. If the length is larger than 128KB, the returned run's length will be
* the smallest multiple of 128KB that is larger than the length.
* 3. The returned run's length will be size-aligned up to 128KB.
*
* XXX-MJ the third rule isn't actually required, so this can just be a dumb
* bump allocator. Maybe there's some benefit to keeping large blocks aligned,
* so let's keep it for now and hope we don't get too much fragmentation.
* Alternately we could try to allocate all blocks of a certain size from the
* same metaslab.
*/
off_t
vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp)
{
	off_t len;
	int align, loc, minblksz, nbits;

	minblksz = 1 << zfs->ashift;
	len = roundup2(*lenp, minblksz);

	assert(len != 0);
	assert(len / minblksz <= INT_MAX);

	/* Apply sizing rules 1 and 2 from the comment above. */
	if (len < MAXBLOCKSIZE) {
		if ((len & (len - 1)) != 0)
			len = (off_t)1 << flsll(len);
		align = len / minblksz;
	} else {
		len = roundup2(len, MAXBLOCKSIZE);
		align = MAXBLOCKSIZE / minblksz;
	}

	/* Scan for a suitably aligned run of clear (free) bits. */
	for (loc = 0, nbits = len / minblksz;; loc = roundup2(loc, align)) {
		bit_ffc_area_at(zfs->spacemap, loc, zfs->spacemapbits, nbits,
		    &loc);
		if (loc == -1) {
			errx(1, "failed to find %ju bytes of space",
			    (uintmax_t)len);
		}
		if ((loc & (align - 1)) == 0)
			break;
	}
	assert(loc + nbits > loc);
	/* Mark the run allocated. */
	bit_nset(zfs->spacemap, loc, loc + nbits - 1);
	*lenp = len;

	return ((off_t)loc << zfs->ashift);
}
/*
 * Size and allocate the in-memory space map for the vdev: one bit per
 * ashift-sized block, covering all complete metaslabs.
 */
static void
vdev_spacemap_init(zfs_opt_t *zfs)
{
	uint64_t blkcnt;

	assert(powerof2(zfs->mssize));

	blkcnt = rounddown2(zfs->asize, zfs->mssize) >> zfs->ashift;
	/*
	 * With the smallest block size of 512B, the limit on the image
	 * size is 2TB.  That should be enough for anyone.
	 */
	if (blkcnt > INT_MAX)
		errx(1, "image size is too large");
	zfs->spacemapbits = (int)blkcnt;

	zfs->spacemap = bit_alloc(zfs->spacemapbits);
	if (zfs->spacemap == NULL)
		err(1, "bitstring allocation failed");
}
/*
 * Serialize the in-memory space map: allocate one SM2-format space map object
 * per metaslab in the MOS, encode the allocated runs for each metaslab, and
 * record the space map object IDs in the space map object array.  This
 * finalizes space allocation; nothing may be allocated from the vdev after
 * this returns.
 */
void
vdev_spacemap_write(zfs_opt_t *zfs)
{
	dnode_phys_t *objarr;
	bitstr_t *spacemap;
	uint64_t *objarrblk;
	off_t smblksz, objarrblksz, objarrloc;
	struct {
		dnode_phys_t *dnode;
		uint64_t dnid;
		off_t loc;
	} *sma;
	/* The object array holds one uint64_t object ID per metaslab. */
	objarrblksz = sizeof(uint64_t) * zfs->mscount;
	assert(objarrblksz <= MAXBLOCKSIZE);
	objarrloc = objset_space_alloc(zfs, zfs->mos, &objarrblksz);
	objarrblk = ecalloc(1, objarrblksz);
	objarr = objset_dnode_lookup(zfs->mos, zfs->objarrid);
	objarr->dn_datablkszsec = objarrblksz >> MINBLOCKSHIFT;
	/*
	 * Use the smallest block size for space maps.  The space allocation
	 * algorithm should aim to minimize the number of holes.
	 */
	smblksz = 1 << zfs->ashift;
	/*
	 * First allocate dnodes and space for all of our space maps.  No more
	 * space can be allocated from the vdev after this point.
	 */
	sma = ecalloc(zfs->mscount, sizeof(*sma));
	for (uint64_t i = 0; i < zfs->mscount; i++) {
		sma[i].dnode = objset_dnode_bonus_alloc(zfs->mos,
		    DMU_OT_SPACE_MAP, DMU_OT_SPACE_MAP_HEADER,
		    sizeof(space_map_phys_t), &sma[i].dnid);
		sma[i].loc = objset_space_alloc(zfs, zfs->mos, &smblksz);
	}
	/* Detach the bitmap so any further allocation attempt crashes loudly. */
	spacemap = zfs->spacemap;
	zfs->spacemap = NULL;
	/*
	 * Now that the set of allocated space is finalized, populate each space
	 * map and write it to the vdev.
	 */
	for (uint64_t i = 0; i < zfs->mscount; i++) {
		space_map_phys_t *sm;
		uint64_t alloc, length, *smblk;
		int shift, startb, endb, srunb, erunb;
		/*
		 * We only allocate a single block for this space map, but
		 * OpenZFS assumes that a space map object with sufficient bonus
		 * space supports histograms.
		 */
		sma[i].dnode->dn_nblkptr = 3;
		sma[i].dnode->dn_datablkszsec = smblksz >> MINBLOCKSHIFT;
		smblk = ecalloc(1, smblksz);
		alloc = length = 0;
		/* Number of bitmap bits (ashift-sized blocks) per metaslab. */
		shift = zfs->msshift - zfs->ashift;
		for (srunb = startb = i * (1 << shift),
		    endb = (i + 1) * (1 << shift);
		    srunb < endb; srunb = erunb) {
			uint64_t runlen, runoff;
			/* Find a run of allocated space. */
			bit_ffs_at(spacemap, srunb, zfs->spacemapbits, &srunb);
			if (srunb == -1 || srunb >= endb)
				break;
			bit_ffc_at(spacemap, srunb, zfs->spacemapbits, &erunb);
			if (erunb == -1 || erunb > endb)
				erunb = endb;
			/*
			 * The space represented by [srunb, erunb) has been
			 * allocated.  Add a record to the space map to indicate
			 * this.  Run offsets are relative to the beginning of
			 * the metaslab.
			 */
			runlen = erunb - srunb;
			runoff = srunb - startb;
			/* Each SM2 record occupies two uint64_t words. */
			assert(length * sizeof(uint64_t) < (uint64_t)smblksz);
			smblk[length] = SM_PREFIX_ENCODE(SM2_PREFIX) |
			    SM2_RUN_ENCODE(runlen) | SM2_VDEV_ENCODE(0);
			smblk[length + 1] = SM2_TYPE_ENCODE(SM_ALLOC) |
			    SM2_OFFSET_ENCODE(runoff);
			alloc += runlen << zfs->ashift;
			length += 2;
		}
		/* The space map header lives in the dnode's bonus buffer. */
		sm = DN_BONUS(sma[i].dnode);
		sm->smp_length = length * sizeof(uint64_t);
		sm->smp_alloc = alloc;
		vdev_pwrite_dnode_data(zfs, sma[i].dnode, smblk, smblksz,
		    sma[i].loc);
		free(smblk);
		/* Record this space map in the space map object array. */
		objarrblk[i] = sma[i].dnid;
	}
	/*
	 * All of the space maps are written, now write the object array.
	 */
	vdev_pwrite_dnode_data(zfs, objarr, objarrblk, objarrblksz, objarrloc);
	free(objarrblk);
	assert(zfs->spacemap == NULL);
	free(spacemap);
	free(sma);
}
/*
 * Create and size the backing image file, then set up space map tracking
 * for the vdev.
 */
void
vdev_init(zfs_opt_t *zfs, const char *image)
{
	int fd;

	assert(zfs->ashift >= MINBLOCKSHIFT);

	fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0644);
	if (fd == -1)
		err(1, "Can't open `%s' for writing", image);
	zfs->fd = fd;
	if (ftruncate(fd, zfs->vdevsize) != 0)
		err(1, "Failed to extend image file `%s'", image);

	vdev_spacemap_init(zfs);
}
/*
 * Release the vdev's file descriptor.  The space map must already have been
 * written out (and detached) by vdev_spacemap_write().
 */
void
vdev_fini(zfs_opt_t *zfs)
{
	assert(zfs->spacemap == NULL);

	if (zfs->fd == -1)
		return;
	if (close(zfs->fd) != 0)
		err(1, "close");
	zfs->fd = -1;
}

551
usr.sbin/makefs/zfs/zap.c Normal file
View File

@ -0,0 +1,551 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2022 The FreeBSD Foundation
*
* This software was developed by Mark Johnston under sponsorship from
* the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/types.h>
#include <sys/endian.h>
#include <assert.h>
#include <stddef.h>
#include <string.h>
#include <util.h>
#include "makefs.h"
#include "zfs.h"
/*
 * In-memory representation of a single ZAP key-value pair.  Values are arrays
 * of 1-, 2-, 4- or 8-byte integers; strings are stored as byte arrays.
 */
typedef struct zfs_zap_entry {
	char *name; /* entry key, private copy */
	uint64_t hash; /* key hash */
	union {
		uint8_t *valp;
		uint16_t *val16p;
		uint32_t *val32p;
		uint64_t *val64p;
	}; /* entry value, an integer array */
	uint64_t val64; /* embedded value for a common case */
	size_t intsz; /* array element size; 1, 2, 4 or 8 */
	size_t intcnt; /* array size */
	STAILQ_ENTRY(zfs_zap_entry) next;
} zfs_zap_entry_t;
/*
 * A ZAP object under construction.  Entries accumulate via zap_add() and are
 * serialized by zap_write(), which picks the micro or fat on-disk format.
 */
struct zfs_zap {
	STAILQ_HEAD(, zfs_zap_entry) kvps;
	uint64_t hashsalt; /* key hash input */
	unsigned long kvpcnt; /* number of key-value pairs */
	unsigned long chunks; /* count of chunks needed for fat ZAP */
	bool micro; /* can this be a micro ZAP? */
	dnode_phys_t *dnode; /* backpointer */
	zfs_objset_t *os; /* backpointer */
};
/*
 * Fat ZAP chunk budget for one entry: one entry chunk plus enough array
 * chunks to hold the NUL-terminated name and the value.
 */
static uint16_t
zap_entry_chunks(zfs_zap_entry_t *ent)
{
	size_t namechunks, valchunks;

	namechunks = howmany(strlen(ent->name) + 1, ZAP_LEAF_ARRAY_BYTES);
	valchunks = howmany(ent->intsz * ent->intcnt, ZAP_LEAF_ARRAY_BYTES);
	return (1 + namechunks + valchunks);
}
/*
 * Compute the salted CRC-64 hash of a ZAP entry name, keeping only the high
 * ZAP_HASHBITS bits.  The lookup table is generated lazily on first use.
 */
static uint64_t
zap_hash(uint64_t salt, const char *name)
{
	static uint64_t crc64_table[256];
	const uint64_t crc64_poly = 0xC96C5795D7870F42UL;
	uint64_t crc;

	assert(salt != 0);
	if (crc64_table[128] == 0) {
		for (int i = 0; i < 256; i++) {
			uint64_t v = i;

			for (int j = 0; j < 8; j++)
				v = (v >> 1) ^ (-(v & 1) & crc64_poly);
			crc64_table[i] = v;
		}
	}
	assert(crc64_table[128] == crc64_poly);

	crc = salt;
	for (const uint8_t *cp = (const uint8_t *)name; *cp != '\0'; cp++)
		crc = (crc >> 8) ^ crc64_table[(crc ^ *cp) & 0xFF];

	/*
	 * Only use 28 bits, since we need 4 bits in the cookie for the
	 * collision differentiator.  We MUST use the high bits, since
	 * those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
	return (crc);
}
/*
 * Create an empty in-memory ZAP bound to the given dnode and object set.
 * The hash salt comes from random(), which makefs seeds deterministically
 * for reproducible images.
 */
zfs_zap_t *
zap_alloc(zfs_objset_t *os, dnode_phys_t *dnode)
{
	zfs_zap_t *zap;

	zap = ecalloc(1, sizeof(*zap));
	STAILQ_INIT(&zap->kvps);
	zap->os = os;
	zap->dnode = dnode;
	zap->hashsalt = ((uint64_t)random() << 32) | random();
	/* Assume the micro format until an entry rules it out. */
	zap->micro = true;
	zap->kvpcnt = 0;
	zap->chunks = 0;
	return (zap);
}
/*
 * Append a key-value pair to the ZAP.  "val" points to "intcnt" integers of
 * "intsz" bytes each, copied into a private buffer owned by the entry.
 */
void
zap_add(zfs_zap_t *zap, const char *name, size_t intsz, size_t intcnt,
    const uint8_t *val)
{
	zfs_zap_entry_t *ent;
	size_t namelen;

	assert(intsz == 1 || intsz == 2 || intsz == 4 || intsz == 8);
	assert(strlen(name) + 1 <= ZAP_MAXNAMELEN);
	assert(intcnt <= ZAP_MAXVALUELEN && intcnt * intsz <= ZAP_MAXVALUELEN);

	ent = ecalloc(1, sizeof(*ent));
	ent->name = estrdup(name);
	ent->hash = zap_hash(zap->hashsalt, ent->name);
	ent->intsz = intsz;
	ent->intcnt = intcnt;
	if (intsz == sizeof(uint64_t) && intcnt == 1) {
		/*
		 * Micro-optimization to elide a memory allocation in that most
		 * common case where this is a directory entry.
		 */
		ent->val64p = &ent->val64;
	} else {
		ent->valp = ecalloc(intcnt, intsz);
	}
	memcpy(ent->valp, val, intcnt * intsz);

	STAILQ_INSERT_TAIL(&zap->kvps, ent, next);
	zap->kvpcnt++;
	zap->chunks += zap_entry_chunks(ent);

	/* Does this entry force the fat on-disk format? */
	namelen = strlen(name) + 1;
	if (zap->micro && (intcnt != 1 || intsz != sizeof(uint64_t) ||
	    namelen > MZAP_NAME_LEN || zap->kvpcnt > MZAP_ENT_MAX))
		zap->micro = false;
}
/*
 * Convenience wrapper: add a single 64-bit integer value, the most common
 * ZAP value type (e.g. directory entries).
 */
void
zap_add_uint64(zfs_zap_t *zap, const char *name, uint64_t val)
{
	zap_add(zap, name, sizeof(val), 1, (const uint8_t *)&val);
}
/*
 * Convenience wrapper: add a string value, stored as a byte array including
 * its NUL terminator.
 */
void
zap_add_string(zfs_zap_t *zap, const char *name, const char *val)
{
	/*
	 * Cast explicitly: zap_add() takes a const uint8_t *, and passing a
	 * const char * without a cast is an incompatible-pointer-type
	 * conversion (an error under -Werror).
	 */
	zap_add(zap, name, 1, strlen(val) + 1, (const uint8_t *)val);
}
/*
 * Return true if the ZAP already contains an entry with the given key.
 * Linear scan; ZAPs built by makefs are small enough for this to be fine.
 */
bool
zap_entry_exists(zfs_zap_t *zap, const char *name)
{
	zfs_zap_entry_t *ent;

	STAILQ_FOREACH(ent, &zap->kvps, next)
		if (strcmp(name, ent->name) == 0)
			return (true);
	return (false);
}
/*
 * Serialize a ZAP using the micro on-disk format: a single block containing
 * the header followed by fixed-size entries.  Only legal when every value is
 * one uint64_t and all names fit in MZAP_NAME_LEN (tracked by zap->micro).
 */
static void
zap_micro_write(zfs_opt_t *zfs, zfs_zap_t *zap)
{
	dnode_phys_t *dnode;
	zfs_zap_entry_t *ent;
	mzap_phys_t *mzap;
	mzap_ent_phys_t *mzent;
	off_t bytes, loc;

	memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
	mzap = (mzap_phys_t *)&zfs->filebuf[0];
	mzap->mz_block_type = ZBT_MICRO;
	mzap->mz_salt = zap->hashsalt;
	mzap->mz_normflags = 0;

	/* The header struct already embeds the first entry slot. */
	bytes = sizeof(*mzap) + (zap->kvpcnt - 1) * sizeof(*mzent);
	assert(bytes <= (off_t)MZAP_MAX_BLKSZ);

	mzent = &mzap->mz_chunk[0];
	STAILQ_FOREACH(ent, &zap->kvps, next) {
		memcpy(&mzent->mze_value, ent->valp, ent->intsz * ent->intcnt);
		mzent->mze_cd = 0; /* XXX-MJ */
		strlcpy(mzent->mze_name, ent->name, sizeof(mzent->mze_name));
		mzent++;
	}

	/* Note: objset_space_alloc() may round "bytes" up. */
	loc = objset_space_alloc(zfs, zap->os, &bytes);

	dnode = zap->dnode;
	dnode->dn_maxblkid = 0;
	dnode->dn_datablkszsec = bytes >> MINBLOCKSHIFT;
	dnode->dn_flags = DNODE_FLAG_USED_BYTES;

	vdev_pwrite_dnode_data(zfs, dnode, zfs->filebuf, bytes, loc);
}
/*
 * Write some data to the fat ZAP leaf chunk starting at index "li".
 *
 * Note that individual integers in the value may be split among consecutive
 * leaves.
 */
static void
zap_fat_write_array_chunk(zap_leaf_t *l, uint16_t li, size_t sz,
    const uint8_t *val)
{
	struct zap_leaf_array *la;

	/*
	 * A zero-length array would leave "la" uninitialized when the chain
	 * is terminated below (undefined behavior), so require at least one
	 * byte.  Callers are expected to satisfy this: names always include
	 * their NUL terminator.
	 */
	assert(sz > 0);
	assert(sz <= ZAP_MAXVALUELEN);

	la = NULL;
	for (uint16_t n, resid = sz; resid > 0; resid -= n, val += n, li++) {
		n = MIN(resid, ZAP_LEAF_ARRAY_BYTES);

		la = &ZAP_LEAF_CHUNK(l, li).l_array;
		assert(la->la_type == ZAP_CHUNK_FREE);
		la->la_type = ZAP_CHUNK_ARRAY;
		memcpy(la->la_array, val, n);
		/* Provisionally link to the next chunk in the run. */
		la->la_next = li + 1;
	}
	/* Terminate the chunk chain at the last chunk written. */
	la->la_next = 0xffff;
}
/*
 * Find the shortest hash prefix length which lets us distribute keys without
 * overflowing a leaf block.  This is not (space) optimal, but is simple, and
 * directories large enough to overflow a single 128KB leaf block are uncommon.
 *
 * A prefix length of N yields 2^N leaf blocks; entries are assigned to leaves
 * by the upper N bits of their hash.
 */
static unsigned int
zap_fat_write_prefixlen(zfs_zap_t *zap, zap_leaf_t *l)
{
	zfs_zap_entry_t *ent;
	unsigned int prefixlen;
	if (zap->chunks <= ZAP_LEAF_NUMCHUNKS(l)) {
		/*
		 * All chunks will fit in a single leaf block.
		 */
		return (0);
	}
	/*
	 * Try successively longer prefixes until every leaf has enough free
	 * chunks for the entries hashed to it.
	 */
	for (prefixlen = 1; prefixlen < (unsigned int)l->l_bs; prefixlen++) {
		uint32_t *leafchunks;
		/* Per-leaf tally of chunks consumed so far at this prefixlen. */
		leafchunks = ecalloc(1u << prefixlen, sizeof(*leafchunks));
		STAILQ_FOREACH(ent, &zap->kvps, next) {
			uint64_t li;
			uint16_t chunks;
			li = ZAP_HASH_IDX(ent->hash, prefixlen);
			chunks = zap_entry_chunks(ent);
			if (ZAP_LEAF_NUMCHUNKS(l) - leafchunks[li] < chunks) {
				/*
				 * Not enough space, grow the prefix and retry.
				 */
				break;
			}
			leafchunks[li] += chunks;
		}
		free(leafchunks);
		if (ent == NULL) {
			/*
			 * Everything fits, we're done.
			 */
			break;
		}
	}
	/*
	 * If this fails, then we need to expand the pointer table.  For now
	 * this situation is unhandled since it is hard to trigger.
	 */
	assert(prefixlen < (unsigned int)l->l_bs);
	return (prefixlen);
}
/*
 * Initialize a fat ZAP leaf block: fill out the header, empty the hash
 * table, and chain every chunk onto the free list.
 */
static void
zap_fat_write_leaf_init(zap_leaf_t *l, uint64_t prefix, int prefixlen)
{
	zap_leaf_phys_t *leaf;
	uint16_t nchunks;

	leaf = l->l_phys;
	nchunks = ZAP_LEAF_NUMCHUNKS(l);

	leaf->l_hdr.lh_block_type = ZBT_LEAF;
	leaf->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
	leaf->l_hdr.lh_nfree = nchunks;
	leaf->l_hdr.lh_prefix = prefix;
	leaf->l_hdr.lh_prefix_len = prefixlen;

	/* Initialize the leaf hash table; 0xffff marks an empty bucket. */
	assert(leaf->l_hdr.lh_nfree < 0xffff);
	memset(leaf->l_hash, 0xff,
	    ZAP_LEAF_HASH_NUMENTRIES(l) * sizeof(*leaf->l_hash));

	/* Chain all chunks into a single free list, terminated by 0xffff. */
	for (uint16_t i = 0; i < nchunks; i++) {
		struct zap_leaf_free *lf;

		lf = &ZAP_LEAF_CHUNK(l, i).l_free;
		lf->lf_type = ZAP_CHUNK_FREE;
		lf->lf_next = (i + 1 == nchunks) ? 0xffff : i + 1;
	}
}
/*
 * Serialize a ZAP using the fat on-disk format: a header block with an
 * embedded pointer table in its second half, followed by 2^prefixlen leaf
 * blocks holding the entries.  All blocks use the maximum (128KB) block size.
 */
static void
zap_fat_write(zfs_opt_t *zfs, zfs_zap_t *zap)
{
	struct dnode_cursor *c;
	zap_leaf_t l;
	zap_phys_t *zaphdr;
	struct zap_table_phys *zt;
	zfs_zap_entry_t *ent;
	dnode_phys_t *dnode;
	uint8_t *leafblks;
	uint64_t lblkcnt, *ptrhasht;
	off_t loc, blksz;
	size_t blkshift;
	unsigned int prefixlen;
	int ptrcnt;
	/*
	 * For simplicity, always use the largest block size.  This should be ok
	 * since most directories will be micro ZAPs, but it's space inefficient
	 * for small ZAPs and might need to be revisited.
	 */
	blkshift = MAXBLOCKSHIFT;
	blksz = (off_t)1 << blkshift;
	/*
	 * Embedded pointer tables give up to 8192 entries.  This ought to be
	 * enough for anything except massive directories.
	 */
	ptrcnt = (blksz / 2) / sizeof(uint64_t);
	memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
	zaphdr = (zap_phys_t *)&zfs->filebuf[0];
	zaphdr->zap_block_type = ZBT_HEADER;
	zaphdr->zap_magic = ZAP_MAGIC;
	zaphdr->zap_num_entries = zap->kvpcnt;
	zaphdr->zap_salt = zap->hashsalt;
	l.l_bs = blkshift;
	l.l_phys = NULL;
	/* The pointer table is embedded, so most fields stay zero. */
	zt = &zaphdr->zap_ptrtbl;
	zt->zt_blk = 0;
	zt->zt_numblks = 0;
	zt->zt_shift = flsll(ptrcnt) - 1;
	zt->zt_nextblk = 0;
	zt->zt_blks_copied = 0;
	/*
	 * How many leaf blocks do we need?  Initialize them and update the
	 * header.
	 */
	prefixlen = zap_fat_write_prefixlen(zap, &l);
	lblkcnt = 1 << prefixlen;
	leafblks = ecalloc(lblkcnt, blksz);
	for (unsigned int li = 0; li < lblkcnt; li++) {
		l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
		zap_fat_write_leaf_init(&l, li, prefixlen);
	}
	zaphdr->zap_num_leafs = lblkcnt;
	zaphdr->zap_freeblk = lblkcnt + 1;
	/*
	 * For each entry, figure out which leaf block it belongs to based on
	 * the upper bits of its hash, allocate chunks from that leaf, and fill
	 * them out.
	 */
	ptrhasht = (uint64_t *)(&zfs->filebuf[0] + blksz / 2);
	STAILQ_FOREACH(ent, &zap->kvps, next) {
		struct zap_leaf_entry *le;
		uint16_t *lptr;
		uint64_t hi, li;
		uint16_t namelen, nchunks, nnamechunks, nvalchunks;
		/* Pointer table slots store leaf block number + 1. */
		hi = ZAP_HASH_IDX(ent->hash, zt->zt_shift);
		li = ZAP_HASH_IDX(ent->hash, prefixlen);
		assert(ptrhasht[hi] == 0 || ptrhasht[hi] == li + 1);
		ptrhasht[hi] = li + 1;
		l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
		namelen = strlen(ent->name) + 1;
		/*
		 * How many leaf chunks do we need for this entry?
		 */
		nnamechunks = howmany(namelen, ZAP_LEAF_ARRAY_BYTES);
		nvalchunks = howmany(ent->intcnt,
		    ZAP_LEAF_ARRAY_BYTES / ent->intsz);
		nchunks = 1 + nnamechunks + nvalchunks;
		/*
		 * Allocate a run of free leaf chunks for this entry,
		 * potentially extending a hash chain.
		 */
		assert(l.l_phys->l_hdr.lh_nfree >= nchunks);
		l.l_phys->l_hdr.lh_nfree -= nchunks;
		l.l_phys->l_hdr.lh_nentries++;
		lptr = ZAP_LEAF_HASH_ENTPTR(&l, ent->hash);
		/* Walk to the end of the hash chain, bumping each entry's
		 * collision differentiator along the way. */
		while (*lptr != 0xffff) {
			assert(*lptr < ZAP_LEAF_NUMCHUNKS(&l));
			le = ZAP_LEAF_ENTRY(&l, *lptr);
			assert(le->le_type == ZAP_CHUNK_ENTRY);
			le->le_cd++;
			lptr = &le->le_next;
		}
		*lptr = l.l_phys->l_hdr.lh_freelist;
		l.l_phys->l_hdr.lh_freelist += nchunks;
		assert(l.l_phys->l_hdr.lh_freelist <=
		    ZAP_LEAF_NUMCHUNKS(&l));
		if (l.l_phys->l_hdr.lh_freelist ==
		    ZAP_LEAF_NUMCHUNKS(&l))
			l.l_phys->l_hdr.lh_freelist = 0xffff;
		/*
		 * Integer values must be stored in big-endian format.
		 */
		switch (ent->intsz) {
		case 1:
			break;
		case 2:
			for (uint16_t *v = ent->val16p;
			    v - ent->val16p < (ptrdiff_t)ent->intcnt;
			    v++)
				*v = htobe16(*v);
			break;
		case 4:
			for (uint32_t *v = ent->val32p;
			    v - ent->val32p < (ptrdiff_t)ent->intcnt;
			    v++)
				*v = htobe32(*v);
			break;
		case 8:
			for (uint64_t *v = ent->val64p;
			    v - ent->val64p < (ptrdiff_t)ent->intcnt;
			    v++)
				*v = htobe64(*v);
			break;
		default:
			assert(0);
		}
		/*
		 * Finally, write out the leaf chunks for this entry.
		 */
		le = ZAP_LEAF_ENTRY(&l, *lptr);
		assert(le->le_type == ZAP_CHUNK_FREE);
		le->le_type = ZAP_CHUNK_ENTRY;
		le->le_next = 0xffff;
		le->le_name_chunk = *lptr + 1;
		le->le_name_numints = namelen;
		le->le_value_chunk = *lptr + 1 + nnamechunks;
		le->le_value_intlen = ent->intsz;
		le->le_value_numints = ent->intcnt;
		le->le_hash = ent->hash;
		zap_fat_write_array_chunk(&l, *lptr + 1, namelen, ent->name);
		zap_fat_write_array_chunk(&l, *lptr + 1 + nnamechunks,
		    ent->intcnt * ent->intsz, ent->valp);
	}
	/*
	 * Initialize unused slots of the pointer table.
	 */
	for (int i = 0; i < ptrcnt; i++)
		if (ptrhasht[i] == 0)
			ptrhasht[i] = (i >> (zt->zt_shift - prefixlen)) + 1;
	/*
	 * Write the whole thing to disk.
	 */
	dnode = zap->dnode;
	dnode->dn_nblkptr = 1;
	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
	dnode->dn_maxblkid = lblkcnt + 1;
	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
	c = dnode_cursor_init(zfs, zap->os, zap->dnode,
	    (lblkcnt + 1) * blksz, blksz);
	/* Block 0 is the header; leaves follow at blocks 1..lblkcnt. */
	loc = objset_space_alloc(zfs, zap->os, &blksz);
	vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, zfs->filebuf, blksz, loc,
	    dnode_cursor_next(zfs, c, 0));
	for (uint64_t i = 0; i < lblkcnt; i++) {
		loc = objset_space_alloc(zfs, zap->os, &blksz);
		vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, leafblks + i * blksz,
		    blksz, loc, dnode_cursor_next(zfs, c, (i + 1) * blksz));
	}
	dnode_cursor_finish(zfs, c);
	free(leafblks);
}
/*
 * Serialize a ZAP to disk, choosing the micro or fat format as determined
 * during construction, then free the in-memory representation.
 */
void
zap_write(zfs_opt_t *zfs, zfs_zap_t *zap)
{
	zfs_zap_entry_t *ent;

	if (zap->micro) {
		zap_micro_write(zfs, zap);
	} else {
		assert(!STAILQ_EMPTY(&zap->kvps));
		assert(zap->kvpcnt > 0);
		zap_fat_write(zfs, zap);
	}

	/* Tear down the entry list; only heap-allocated values are freed. */
	while (!STAILQ_EMPTY(&zap->kvps)) {
		ent = STAILQ_FIRST(&zap->kvps);
		STAILQ_REMOVE_HEAD(&zap->kvps, next);
		if (ent->val64p != &ent->val64)
			free(ent->valp);
		free(ent->name);
		free(ent);
	}
	free(zap);
}

167
usr.sbin/makefs/zfs/zfs.h Normal file
View File

@ -0,0 +1,167 @@
/*-
* SPDX-License-Identifier: BSD-2-Clause-FreeBSD
*
* Copyright (c) 2022 The FreeBSD Foundation
*
* This software was developed by Mark Johnston under sponsorship from
* the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _MAKEFS_ZFS_H_
#define _MAKEFS_ZFS_H_
#include <sys/types.h>
#include <sys/queue.h>
#include <bitstring.h>
#include <stdbool.h>
#include "makefs.h"
#include "zfs/nvlist.h"
#define ASSERT assert
#include "zfs/zfsimpl.h"
#define MAXBLOCKSHIFT 17 /* 128KB */
#define MAXBLOCKSIZE ((off_t)(1 << MAXBLOCKSHIFT))
_Static_assert(MAXBLOCKSIZE == SPA_OLDMAXBLOCKSIZE, "");
#define MINBLOCKSHIFT 9 /* 512B */
#define MINBLOCKSIZE ((off_t)(1 << MINBLOCKSHIFT))
_Static_assert(MINBLOCKSIZE == SPA_MINBLOCKSIZE, "");
#define MINDEVSIZE ((off_t)SPA_MINDEVSIZE)
/* All data was written in this transaction group. */
#define TXG 4
typedef struct zfs_dsl_dataset zfs_dsl_dataset_t;
typedef struct zfs_dsl_dir zfs_dsl_dir_t;
typedef struct zfs_objset zfs_objset_t;
typedef struct zfs_zap zfs_zap_t;
/*
 * Describes one non-root dataset to create; "params" is the raw parameter
 * string (presumably a command-line argument -- confirm at the parsing site).
 */
struct dataset_desc {
	char *params;
	STAILQ_ENTRY(dataset_desc) next;
};
/*
 * Run-time context for building a ZFS image: user-supplied pool/dataset
 * parameters plus builder state for the MOS, DSL and the backing vdev.
 */
typedef struct {
	bool nowarn; /* NOTE(review): presumably suppresses warnings -- confirm */
	/* I/O buffer, just for convenience. */
	char filebuf[MAXBLOCKSIZE];
	/* Pool parameters. */
	const char *poolname;
	char *rootpath; /* implicit mount point prefix */
	char *bootfs; /* bootable dataset, pool property */
	int ashift; /* vdev block size */
	uint64_t mssize; /* metaslab size */
	STAILQ_HEAD(, dataset_desc) datasetdescs; /* non-root dataset descrs */
	/* Pool state. */
	uint64_t poolguid; /* pool and root vdev GUID */
	zfs_zap_t *poolprops;
	/* MOS state. */
	zfs_objset_t *mos; /* meta object set */
	uint64_t objarrid; /* space map object array */
	/* DSL state. */
	zfs_dsl_dir_t *rootdsldir; /* root DSL directory */
	zfs_dsl_dataset_t *rootds;
	zfs_dsl_dir_t *origindsldir; /* $ORIGIN */
	zfs_dsl_dataset_t *originds;
	zfs_dsl_dataset_t *snapds;
	zfs_zap_t *cloneszap;
	zfs_dsl_dir_t *freedsldir; /* $FREE */
	zfs_dsl_dir_t *mosdsldir; /* $MOS */
	/* vdev state. */
	int fd; /* vdev disk fd */
	uint64_t vdevguid; /* disk vdev GUID */
	off_t vdevsize; /* vdev size, including labels */
	off_t asize; /* vdev size, excluding labels */
	bitstr_t *spacemap; /* space allocation tracking */
	int spacemapbits; /* one bit per ashift-sized block */
	uint64_t msshift; /* log2(metaslab size) */
	uint64_t mscount; /* number of metaslabs for this vdev */
} zfs_opt_t;
/* dsl.c */
void dsl_init(zfs_opt_t *);
const char *dsl_dir_fullname(const zfs_dsl_dir_t *);
uint64_t dsl_dir_id(zfs_dsl_dir_t *);
uint64_t dsl_dir_dataset_id(zfs_dsl_dir_t *);
void dsl_dir_foreach(zfs_opt_t *, zfs_dsl_dir_t *,
void (*)(zfs_opt_t *, zfs_dsl_dir_t *, void *), void *);
int dsl_dir_get_canmount(zfs_dsl_dir_t *, uint64_t *);
char *dsl_dir_get_mountpoint(zfs_opt_t *, zfs_dsl_dir_t *);
bool dsl_dir_has_dataset(zfs_dsl_dir_t *);
bool dsl_dir_dataset_has_objset(zfs_dsl_dir_t *);
void dsl_dir_dataset_write(zfs_opt_t *, zfs_objset_t *, zfs_dsl_dir_t *);
void dsl_dir_size_set(zfs_dsl_dir_t *, uint64_t);
void dsl_write(zfs_opt_t *);
/* fs.c */
void fs_build(zfs_opt_t *, int, fsnode *);
/* objset.c */
zfs_objset_t *objset_alloc(zfs_opt_t *zfs, uint64_t type);
off_t objset_space_alloc(zfs_opt_t *, zfs_objset_t *, off_t *);
dnode_phys_t *objset_dnode_alloc(zfs_objset_t *, uint8_t, uint64_t *);
dnode_phys_t *objset_dnode_bonus_alloc(zfs_objset_t *, uint8_t, uint8_t,
uint16_t, uint64_t *);
dnode_phys_t *objset_dnode_lookup(zfs_objset_t *, uint64_t);
void objset_root_blkptr_copy(const zfs_objset_t *, blkptr_t *);
uint64_t objset_space(const zfs_objset_t *);
void objset_write(zfs_opt_t *zfs, zfs_objset_t *os);
/* vdev.c */
void vdev_init(zfs_opt_t *, const char *);
off_t vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp);
void vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
blkptr_t *bp);
void vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp);
void vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data,
off_t sz, off_t loc);
void vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp);
void vdev_spacemap_write(zfs_opt_t *);
void vdev_fini(zfs_opt_t *zfs);
/* zap.c */
zfs_zap_t *zap_alloc(zfs_objset_t *, dnode_phys_t *);
void zap_add(zfs_zap_t *, const char *, size_t, size_t, const uint8_t *);
void zap_add_uint64(zfs_zap_t *, const char *, uint64_t);
void zap_add_string(zfs_zap_t *, const char *, const char *);
bool zap_entry_exists(zfs_zap_t *, const char *);
void zap_write(zfs_opt_t *, zfs_zap_t *);
/* zfs.c */
struct dnode_cursor *dnode_cursor_init(zfs_opt_t *, zfs_objset_t *,
dnode_phys_t *, off_t, off_t);
blkptr_t *dnode_cursor_next(zfs_opt_t *, struct dnode_cursor *, off_t);
void dnode_cursor_finish(zfs_opt_t *, struct dnode_cursor *);
#endif /* !_MAKEFS_ZFS_H_ */